From fb14a064e6e2e735bb385012d46403c4c35a6286 Mon Sep 17 00:00:00 2001 From: scwf Date: Mon, 29 Sep 2014 21:45:36 +0800 Subject: [PATCH 01/32] initial support orc in spark sql --- .../plans/logical/basicOperators.scala | 23 + sql/core/pom.xml | 15 + .../scala/org/apache/spark/sql/SQLConf.scala | 7 + .../org/apache/spark/sql/SQLContext.scala | 44 ++ .../org/apache/spark/sql/SchemaRDDLike.scala | 13 +- .../spark/sql/execution/SparkStrategies.scala | 22 + .../apache/spark/sql/orc/OrcRelation.scala | 263 +++++++++++ .../spark/sql/orc/OrcTableOperations.scala | 418 ++++++++++++++++++ 8 files changed, 804 insertions(+), 1 deletion(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/orc/OrcRelation.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/orc/OrcTableOperations.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala index 391508279bb80..e6fd085292271 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala @@ -114,6 +114,22 @@ case class InsertIntoTable( } } +case class InsertIntoOrcTable( + table: LogicalPlan, + partition: Map[String, Option[String]], + child: LogicalPlan, + overwrite: Boolean) + extends LogicalPlan { + // The table being inserted into is a child for the purposes of transformations. + def children = table :: child :: Nil + def references = Set.empty + def output = child.output + + override lazy val resolved = childrenResolved && child.output.zip(table.output).forall { + case (childAttr, tableAttr) => childAttr.dataType == tableAttr.dataType + } +} + case class CreateTableAsSelect( databaseName: Option[String], tableName: String, @@ -128,6 +144,13 @@ case class WriteToFile( override def output = child.output } +case class WriteToOrcFile( + path: String, + child: LogicalPlan) extends UnaryNode { + def references = Set.empty + def output = child.output +} + case class Sort(order: Seq[SortOrder], child: LogicalPlan) extends UnaryNode { override def output = child.output } diff --git a/sql/core/pom.xml b/sql/core/pom.xml index bd110218d34f7..724d632c480b8 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -53,6 +53,21 @@ test-jar test + + org.spark-project.hive + hive-exec + ${hive.version} + + + commons-logging + commons-logging + + + org.slf4j + slf4j-api + + + com.twitter parquet-column diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index f6f4cf3b80d41..1fca2f380e7f1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -35,6 +35,7 @@ private[spark] object SQLConf { val PARQUET_BINARY_AS_STRING = "spark.sql.parquet.binaryAsString" val PARQUET_CACHE_METADATA = "spark.sql.parquet.cacheMetadata" val PARQUET_COMPRESSION = "spark.sql.parquet.compression.codec" + val ORC_COMPRESSION = "spark.sql.orc.compression.codec" // This is only used for the thriftserver val THRIFTSERVER_POOL = "spark.sql.thriftserver.scheduler.pool" @@ -83,6 +84,12 @@ private[sql] trait SQLConf { /** The compression codec for writing to a Parquetfile */ private[spark] def parquetCompressionCodec: String = getConf(PARQUET_COMPRESSION, "snappy") + /** The compression codec for writing to a Orcfile + * 
Note: only support zlib now since we use ```OrcOutputFormat.getRecordWriter``` ,which is not + * allowed to configure thr compression kind + */ + private[spark] def orcCompressionCodec: String = getConf(ORC_COMPRESSION, "zlib") + /** The number of rows that will be */ private[spark] def columnBatchSize: Int = getConf(COLUMN_BATCH_SIZE, "1000").toInt diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index a42bedbe6c04e..c1b6e44af4f84 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -37,6 +37,7 @@ import org.apache.spark.sql.execution.SparkStrategies import org.apache.spark.sql.json._ import org.apache.spark.sql.parquet.ParquetRelation import org.apache.spark.{Logging, SparkContext} +import org.apache.spark.sql.orc.OrcRelation /** * :: AlphaComponent :: @@ -145,6 +146,14 @@ class SQLContext(@transient val sparkContext: SparkContext) def parquetFile(path: String): SchemaRDD = new SchemaRDD(this, parquet.ParquetRelation(path, Some(sparkContext.hadoopConfiguration), this)) + /** + * Loads a Orc file, returning the result as a [[SchemaRDD]]. + * + * @group userf + */ + def orcFile(path: String): SchemaRDD = + new SchemaRDD(this, orc.OrcRelation(path, Some(sparkContext.hadoopConfiguration), this)) + /** * Loads a JSON file (one object per line), returning the result as a [[SchemaRDD]]. * It goes through the entire dataset once to determine the schema. @@ -244,6 +253,40 @@ class SQLContext(@transient val sparkContext: SparkContext) path, ScalaReflection.attributesFor[A], allowExisting, conf, this)) } + /** + * :: Experimental :: + * Creates an empty parquet file with the schema of class `A`, which can be registered as a table. + * This registered table can be used as the target of future `insertInto` operations. + * + * {{{ + * val sqlContext = new SQLContext(...) + * import sqlContext._ + * + * case class Person(name: String, age: Int) + * createOrcFile[Person]("path/to/file.orc").registerTempTable("people") + * sql("INSERT INTO people SELECT 'michael', 29") + * }}} + * + * @tparam A A case class type that describes the desired schema of the parquet file to be + * created. + * @param path The path where the directory containing parquet metadata should be created. + * Data inserted into this table will also be stored at this location. + * @param allowExisting When false, an exception will be thrown if this directory already exists. + * @param conf A Hadoop configuration object that can be used to specify options to the parquet + * output format. + * + * @group userf + */ + @Experimental + def createOrcFile[A <: Product : TypeTag]( + path: String, + allowExisting: Boolean = true, + conf: Configuration = new Configuration()): SchemaRDD = { + new SchemaRDD( + this, + OrcRelation.createEmpty(path, ScalaReflection.attributesFor[A], allowExisting, conf, this)) + } + /** * Registers the given RDD as a temporary table in the catalog. Temporary tables exist only * during the lifetime of this instance of SQLContext. 
@@ -328,6 +371,7 @@ class SQLContext(@transient val sparkContext: SparkContext) HashJoin :: InMemoryScans :: ParquetOperations :: + OrcOperations:: BasicOperators :: CartesianProduct :: BroadcastNestedLoopJoin :: Nil diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala index e52eeb3e1c47e..4e6546332cda6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala @@ -18,9 +18,9 @@ package org.apache.spark.sql import org.apache.spark.annotation.{DeveloperApi, Experimental} -import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.SparkLogicalPlan +import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation /** * Contains functions that are shared between all SchemaRDD types (i.e., Scala, Java) @@ -77,6 +77,17 @@ private[sql] trait SchemaRDDLike { sqlContext.executePlan(WriteToFile(path, logicalPlan)).toRdd } + /** + * Saves the contents of this `SchemaRDD` as a orc file, preserving the schema. Files that + * are written out using this method can be read back in as a SchemaRDD using the `orcFile` + * function. + * + * @group schema + */ + def saveAsOrcFile(path: String): Unit = { + sqlContext.executePlan(WriteToOrcFile(path, logicalPlan)).toRdd + } + /** * Registers this RDD as a temporary table using the given name. The lifetime of this temporary * table is tied to the [[SQLContext]] that was used to create this SchemaRDD. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 45687d960404c..0ab938461b7c6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.catalyst.types._ import org.apache.spark.sql.columnar.{InMemoryRelation, InMemoryColumnarTableScan} import org.apache.spark.sql.parquet._ +import org.apache.spark.sql.orc.{OrcTableScan, InsertIntoOrcTable, OrcRelation} private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { self: SQLContext#SparkPlanner => @@ -237,6 +238,27 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { } } + object OrcOperations extends Strategy { + def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case logical.WriteToOrcFile(path, child) => + val relation = + OrcRelation.create(path, child, sparkContext.hadoopConfiguration, sqlContext) + InsertIntoOrcTable(relation, planLater(child), overwrite=true) :: Nil + case logical.InsertIntoOrcTable(table: OrcRelation, partition, child, overwrite) => + InsertIntoOrcTable(table, planLater(child), overwrite) :: Nil + case PhysicalOperation(projectList, filters, relation: OrcRelation) => + // TODO: need to implement predict push down. 
+ val prunePushedDownFilters = identity[Seq[Expression]] _ + pruneFilterProject( + projectList, + filters, + prunePushedDownFilters, + OrcTableScan(_, relation, None)) :: Nil + + case _ => Nil + } + } + object InMemoryScans extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case PhysicalOperation(projectList, filters, mem: InMemoryRelation) => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/orc/OrcRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/orc/OrcRelation.scala new file mode 100644 index 0000000000000..8b5156c0f3769 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/orc/OrcRelation.scala @@ -0,0 +1,263 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.orc + +import org.apache.hadoop.hive.conf.HiveConf +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, LeafNode} +import org.apache.spark.sql.catalyst.analysis.{UnresolvedException, MultiInstanceRelation} +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.expressions.AttributeReference +import org.apache.spark.sql.catalyst.types._ +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.permission.FsAction +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector +import org.apache.hadoop.hive.ql.io.orc._ +import org.apache.hadoop.mapred.{FileInputFormat => NewFileInputFormat, JobConf} +import org.apache.hadoop.hive.ql.io.orc.OrcProto.Type.Kind +import org.apache.hadoop.mapreduce.Job +import parquet.hadoop.util.ContextUtil +import java.util.Properties +import java.io.IOException +import scala.collection.mutable +import org.apache.spark.sql.SQLContext + + +private[sql] case class OrcRelation( + path: String, + @transient conf: Option[Configuration], + @transient sqlContext: SQLContext, + partitioningAttributes: Seq[Attribute] = Nil) + extends LeafNode with MultiInstanceRelation { + self: Product => + + val prop: Properties = new Properties + + var rowClass: Class[_] = null + + val fieldIdCache: mutable.Map[String, Int] = new mutable.HashMap[String, Int] + + val fieldNameTypeCache: mutable.Map[String, String] = new mutable.HashMap[String, String] + + override val output = orcSchema + + def orcSchema: Seq[Attribute] = { + val origPath = new Path(path) + val reader = OrcFileOperator.readMetaData(origPath) + + if (null != reader) { + val inspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] + val fields = inspector.getAllStructFieldRefs + + if (fields.size() == 0) { + return Seq.empty + } + + val totalType = reader.getTypes.get(0) + val keys = totalType.getFieldNamesList + val types = totalType.getSubtypesList + log.info("field name is 
{}", keys) + log.info("types is {}", types) + + val colBuff = new StringBuilder + val typeBuff = new StringBuilder + for (i <- 0 until fields.size()) { + val fieldName = fields.get(i).getFieldName + val typeName = fields.get(i).getFieldObjectInspector.getTypeName + colBuff.append(fieldName) + fieldNameTypeCache.put(fieldName, typeName) + fieldIdCache.put(fieldName, i) + colBuff.append(",") + typeBuff.append(typeName) + typeBuff.append(":") + } + colBuff.setLength(colBuff.length - 1) + typeBuff.setLength(typeBuff.length - 1) + prop.setProperty("columns", colBuff.toString()) + prop.setProperty("columns.types", typeBuff.toString()) + val attributes = convertToAttributes(reader, keys, types) + attributes + } else { + Seq.empty + } + } + + def convertToAttributes( + reader: Reader, + keys: java.util.List[String], + types: java.util.List[Integer]): Seq[Attribute] = { + val range = 0.until(keys.size()) + range.map { + i => reader.getTypes.get(types.get(i)).getKind match { + case Kind.BOOLEAN => + new AttributeReference(keys.get(i), BooleanType, false)() + case Kind.STRING => + new AttributeReference(keys.get(i), StringType, true)() + case Kind.BYTE => + new AttributeReference(keys.get(i), ByteType, true)() + case Kind.SHORT => + new AttributeReference(keys.get(i), ShortType, true)() + case Kind.INT => + new AttributeReference(keys.get(i), IntegerType, true)() + case Kind.LONG => + new AttributeReference(keys.get(i), LongType, false)() + case Kind.FLOAT => + new AttributeReference(keys.get(i), FloatType, false)() + case Kind.DOUBLE => + new AttributeReference(keys.get(i), DoubleType, false)() + case _ => { + log.info("unsupported datatype") + null + } + } + } + } + + override def newInstance() = OrcRelation(path, conf, sqlContext).asInstanceOf[this.type] +} + +private[sql] object OrcRelation { + + + // The orc compression short names + val shortOrcCompressionCodecNames = Map( + "NONE" -> CompressionKind.NONE, + "UNCOMPRESSED" -> CompressionKind.NONE, + "SNAPPY" -> CompressionKind.SNAPPY, + "ZLIB" -> CompressionKind.ZLIB, + "LZO" -> CompressionKind.LZO) + + /** + * Creates a new OrcRelation and underlying Orcfile for the given LogicalPlan. Note that + * this is used inside [[org.apache.spark.sql.execution.SparkStrategies]] to + * create a resolved relation as a data sink for writing to a Orcfile. + * + * @param pathString The directory the ORCfile will be stored in. + * @param child The child node that will be used for extracting the schema. + * @param conf A configuration to be used. + * @return An empty OrcRelation with inferred metadata. + */ + def create(pathString: String, + child: LogicalPlan, + conf: Configuration, + sqlContext: SQLContext): OrcRelation = { + if (!child.resolved) { + throw new UnresolvedException[LogicalPlan]( + child, + "Attempt to create Orc table from unresolved child") + } + createEmpty(pathString, child.output, false, conf, sqlContext) + } + + /** + * Creates an empty OrcRelation and underlying Orcfile that only + * consists of the Metadata for the given schema. + * + * @param pathString The directory the Orcfile will be stored in. + * @param attributes The schema of the relation. + * @param conf A configuration to be used. + * @return An empty OrcRelation. 
+ */ + def createEmpty(pathString: String, + attributes: Seq[Attribute], + allowExisting: Boolean, + conf: Configuration, + sqlContext: SQLContext): OrcRelation = { + val path = checkPath(pathString, allowExisting, conf) + + /** set compression kind in hive 0.13.1 + * conf.set( + * HiveConf.ConfVars.OHIVE_ORC_DEFAULT_COMPRESS.varname, + * shortOrcCompressionCodecNames.getOrElse( + * sqlContext.orcCompressionCodec.toUpperCase, CompressionKind.NONE).name) + */ + val orcRelation = new OrcRelation(path.toString, Some(conf), sqlContext) + + orcRelation + } + + private def checkPath(pathStr: String, allowExisting: Boolean, conf: Configuration): Path = { + if (pathStr == null) { + throw new IllegalArgumentException("Unable to create OrcRelation: path is null") + } + val origPath = new Path(pathStr) + val fs = origPath.getFileSystem(conf) + if (fs == null) { + throw new IllegalArgumentException( + s"Unable to create OrcRelation: incorrectly formatted path $pathStr") + } + val path = origPath.makeQualified(fs) + if (!allowExisting && fs.exists(path)) { + sys.error(s"File $pathStr already exists.") + } + + if (fs.exists(path) && + !fs.getFileStatus(path) + .getPermission + .getUserAction + .implies(FsAction.READ_WRITE)) { + throw new IOException( + s"Unable to create OrcRelation: path $path not read-writable") + } + path + } +} + +private[sql] object OrcFileOperator { + final val COMPRESSION: String = "orcfiles.compression" + + /** + * + * @param origPath + * @return + */ + def readMetaData(origPath: Path): Reader = { + val job = new Job() + val conf = ContextUtil.getConfiguration(job).asInstanceOf[JobConf] + val fs: FileSystem = origPath.getFileSystem(conf) + val orcFiles = FileSystemHelper.listFiles(origPath, conf, ".orc") + if (orcFiles != Seq.empty) { +// NewFileInputFormat.setInputPaths(conf, orcFiles(0))// why set inputpath here + if (fs.exists(origPath)) { + OrcFile.createReader(fs, orcFiles(0)) + } else { + null + } + } else { + null + } + } + + + def writeMetaData(attributes: Seq[Attribute], origPath: Path, conf: Configuration) { + if (origPath == null) { + throw new IllegalArgumentException("Unable to write Parquet metadata: path is null") + } + val fs = origPath.getFileSystem(conf) + if (fs == null) { + throw new IllegalArgumentException( + s"Unable to write Orc metadata: path $origPath is incorrectly formatted") + } + + val path = origPath.makeQualified(fs) + if (fs.exists(path) && !fs.getFileStatus(path).isDir) { + throw new IllegalArgumentException(s"Expected to write to directory $path but found file") + } + } +} + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/orc/OrcTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/orc/OrcTableOperations.scala new file mode 100644 index 0000000000000..bb1f342cc9ee7 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/orc/OrcTableOperations.scala @@ -0,0 +1,418 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.spark.sql.orc + +import org.apache.spark.sql.execution.{ExistingRdd, LeafNode, UnaryNode, SparkPlan} +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.{TaskContext, SerializableWritable} +import org.apache.spark.rdd.RDD + +import _root_.parquet.hadoop.util.ContextUtil +import org.apache.hadoop.fs.Path +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, FileOutputCommitter} +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat +import org.apache.hadoop.io.{Writable, NullWritable} +import org.apache.hadoop.mapreduce.{TaskID, TaskAttemptContext, Job} + +import org.apache.hadoop.hive.ql.io.orc.{OrcFile, OrcSerde, OrcInputFormat, OrcOutputFormat} +import org.apache.hadoop.hive.serde2.objectinspector._ +import org.apache.hadoop.hive.serde2.ColumnProjectionUtils +import org.apache.hadoop.hive.common.`type`.{HiveDecimal, HiveVarchar} + +import java.io.IOException +import java.text.SimpleDateFormat +import java.util.{Locale, Date} +import scala.collection.JavaConversions._ +import org.apache.hadoop.mapred.{SparkHadoopMapRedUtil, Reporter, JobConf} + +/** + * orc table scan operator. Imports the file that backs the given + * [[org.apache.spark.sql.orc.OrcRelation]] as a ``RDD[Row]``. + */ +case class OrcTableScan( + output: Seq[Attribute], + relation: OrcRelation, + columnPruningPred: Option[Expression]) + extends LeafNode { + + @transient + lazy val serde: OrcSerde = initSerde + + @transient + lazy val getFieldValue: Seq[Product => Any] = { + val inspector = serde.getObjectInspector.asInstanceOf[StructObjectInspector] + output.map(attr => { + val ref = inspector.getStructFieldRef(attr.name.toLowerCase(Locale.ENGLISH)) + row: Product => { + val fieldData = row.productElement(1) + val data = inspector.getStructFieldData(fieldData, ref) + unwrapData(data, ref.getFieldObjectInspector) + } + }) + } + + private def initSerde(): OrcSerde = { + val serde = new OrcSerde + serde.initialize(null, relation.prop) + serde + } + + def unwrapData(data: Any, oi: ObjectInspector): Any = oi match { + case pi: PrimitiveObjectInspector => pi.getPrimitiveJavaObject(data) + case li: ListObjectInspector => + Option(li.getList(data)) + .map(_.map(unwrapData(_, li.getListElementObjectInspector)).toSeq) + .orNull + case mi: MapObjectInspector => + Option(mi.getMap(data)).map( + _.map { + case (k, v) => + (unwrapData(k, mi.getMapKeyObjectInspector), + unwrapData(v, mi.getMapValueObjectInspector)) + }.toMap).orNull + case si: StructObjectInspector => + val allRefs = si.getAllStructFieldRefs + new GenericRow( + allRefs.map(r => + unwrapData(si.getStructFieldData(data, r), r.getFieldObjectInspector)).toArray) + } + + override def execute(): RDD[Row] = { + val sc = sqlContext.sparkContext + val job = new Job(sc.hadoopConfiguration) + + val conf: Configuration = ContextUtil.getConfiguration(job) + val fileList = FileSystemHelper.listFiles(relation.path, conf) + + // add all paths in the directory but skip "hidden" ones such + // as "_SUCCESS" + for (path <- fileList if 
!path.getName.startsWith("_")) { + FileInputFormat.addInputPath(job, path) + } + val serialConf = sc.broadcast(new SerializableWritable(conf)) + + setColumnIds(output, relation, conf) + val inputClass = classOf[OrcInputFormat].asInstanceOf[ + Class[_ <: org.apache.hadoop.mapred.InputFormat[Void, Row]]] + + val rowRdd = productToRowRdd(sc.hadoopRDD[Void, Row]( + serialConf.value.value.asInstanceOf[JobConf], inputClass, classOf[Void], classOf[Row])) + rowRdd + } + + /** + * @param output + * @param relation + * @param conf + */ + def setColumnIds(output: Seq[Attribute], relation: OrcRelation, conf: Configuration) { + val idBuff = new StringBuilder() + + output.map(att => { + val realName = att.name.toLowerCase(Locale.ENGLISH) + val id = relation.fieldIdCache.getOrElse(realName, null) + if (null != id) { + idBuff.append(id) + idBuff.append(",") + } + }) + if (idBuff.length > 0) { + idBuff.setLength(idBuff.length - 1) + } + conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, idBuff.toString()) + } + + /** + * + * @param data + * @tparam A + * @return + */ + def productToRowRdd[A <: Product](data: RDD[A]): RDD[Row] = { + data.mapPartitions { + iterator => + if (iterator.isEmpty) { + Iterator.empty + } else { + val bufferedIterator = iterator.buffered + bufferedIterator.map {r => + val values = getFieldValue.map(_(r)) + new GenericRow(values.map { + case n: String if n.toLowerCase == "null" => "" + case varchar: HiveVarchar => varchar.getValue + case decimal: HiveDecimal => + BigDecimal(decimal.bigDecimalValue) + case null => "" + case other => other + + }.toArray) + } + } + } + } + + /** + * Applies a (candidate) projection. + * + * @param prunedAttributes The list of attributes to be used in the projection. + * @return Pruned TableScan. + */ + def pruneColumns(prunedAttributes: Seq[Attribute]): OrcTableScan = { + OrcTableScan(prunedAttributes, relation, columnPruningPred) + } +} + +/** + * + * @param relation + * @param child + * @param overwrite + */ +private[sql] case class InsertIntoOrcTable( + relation: OrcRelation, + child: SparkPlan, + overwrite: Boolean = false) + extends UnaryNode with SparkHadoopMapRedUtil with org.apache.spark.Logging { + + override def output = child.output + + val intputClass: Class[_] = getInputClass + + @transient val sc = sqlContext.sparkContext + + val inspector = sc.broadcast(getInspector(intputClass)) + + @transient lazy val orcSerde = initFieldInfo + + private def getInputClass: Class[_] = { + val existRdd = child.asInstanceOf[ExistingRdd] + val productClass = existRdd.rdd.firstParent.elementClassTag.runtimeClass + logInfo("productClass is " + productClass) + val clazz = productClass + if (null == relation.rowClass) { + relation.rowClass = clazz + } + clazz + } + + private def getInspector(clazz: Class[_]): ObjectInspector = { + val inspector = ObjectInspectorFactory.getReflectionObjectInspector(clazz, + ObjectInspectorFactory.ObjectInspectorOptions.JAVA) + inspector + } + + private def initFieldInfo(): OrcSerde = { + val serde: OrcSerde = new OrcSerde + serde + } + + /** + * Inserts all rows into the Orc file. 
+ */ + override def execute() = { + val childRdd = child.execute() + assert(childRdd != null) + + val job = new Job(sqlContext.sparkContext.hadoopConfiguration) + // TODO: move that to function in object + val conf = job.getConfiguration + + val fspath = new Path(relation.path) + val fs = fspath.getFileSystem(conf) + + if (overwrite) { + try { + fs.delete(fspath, true) + } catch { + case e: IOException => + throw new IOException( + s"Unable to clear output directory ${fspath.toString} prior" + + s" to InsertIntoOrcTable:\n${e.toString}") + } + } + + val existRdd = child.asInstanceOf[ExistingRdd] + val parentRdd = existRdd.rdd.firstParent[Product] + val writableRdd = parentRdd.map(obj => { + orcSerde.serialize(obj, inspector.value) + }) + + saveAsHadoopFile(writableRdd, relation.rowClass, relation.path, conf) + + // We return the child RDD to allow chaining (alternatively, one could return nothing). + childRdd + } + + + // based on ``saveAsNewAPIHadoopFile`` in [[PairRDDFunctions]] + // TODO: Maybe PairRDDFunctions should use Product2 instead of Tuple2? + // .. then we could use the default one and could use [[MutablePair]] + // instead of ``Tuple2`` + private def saveAsHadoopFile( + rdd: RDD[Writable], + rowClass: Class[_], + path: String, + @transient conf: Configuration) { + val job = new Job(conf) + val keyType = classOf[Void] + job.setOutputKeyClass(keyType) + job.setOutputValueClass(classOf[Writable]) + FileOutputFormat.setOutputPath(job, new Path(path)) + + val wrappedConf = new SerializableWritable(job.getConfiguration) + + val formatter = new SimpleDateFormat("yyyyMMddHHmm") + val jobtrackerID = formatter.format(new Date()) + val stageId = sqlContext.sparkContext.newRddId() + + val taskIdOffset = + if (overwrite) { + 1 + } else { + FileSystemHelper + .findMaxTaskId(FileOutputFormat.getOutputPath(job).toString, job.getConfiguration) + 1 + } + + def getWriter( + outFormat: OrcOutputFormat, + conf: Configuration, + path: Path, + reporter: Reporter) = { + val fs = path.getFileSystem(conf) + outFormat.getRecordWriter(fs, conf.asInstanceOf[JobConf], path.toUri.getPath, reporter). + asInstanceOf[org.apache.hadoop.mapred.RecordWriter[NullWritable, Writable]] + } + + def getCommitterAndWriter(offset: Int, context: TaskAttemptContext) = { + val outFormat = new OrcOutputFormat + + val taskId: TaskID = context.getTaskAttemptID.getTaskID + val partition: Int = taskId.getId + val filename = s"part-r-${partition + offset}.orc" + val output: Path = FileOutputFormat.getOutputPath(context) + val committer = new FileOutputCommitter(output, context) + val path = new Path(committer.getWorkPath, filename) + val writer = getWriter(outFormat, wrappedConf.value, path, Reporter.NULL) + (committer, writer) + } + + def writeShard(context: TaskContext, iter: Iterator[Writable]): Int = { + // Hadoop wants a 32-bit task attempt ID, so if ours is bigger than Int.MaxValue, roll it + // around by taking a mod. We expect that no task will be attempted 2 billion times. 
+ val attemptNumber = (context.attemptId % Int.MaxValue).toInt + /* "reduce task" */ + val attemptId = newTaskAttemptID(jobtrackerID, stageId, isMap = false, context.partitionId, + attemptNumber) + val hadoopContext = newTaskAttemptContext(wrappedConf.value.asInstanceOf[JobConf], attemptId) + val workerAndComitter = getCommitterAndWriter(taskIdOffset, hadoopContext) + val writer = workerAndComitter._2 + + while (iter.hasNext) { + val row = iter.next() + writer.write(NullWritable.get(), row) + } + + writer.close(Reporter.NULL) + workerAndComitter._1.commitTask(hadoopContext) + return 1 + } + + /* apparently we need a TaskAttemptID to construct an OutputCommitter; + * however we're only going to use this local OutputCommitter for + * setupJob/commitJob, so we just use a dummy "map" task. + */ + val jobAttemptId = newTaskAttemptID(jobtrackerID, stageId, isMap = true, 0, 0) + val jobTaskContext = newTaskAttemptContext( + wrappedConf.value.asInstanceOf[JobConf], jobAttemptId) + val workerAndComitter = getCommitterAndWriter(taskIdOffset, jobTaskContext) + workerAndComitter._1.setupJob(jobTaskContext) + sc.runJob(rdd, writeShard _) + workerAndComitter._1.commitJob(jobTaskContext) + } + +} + +/** + * + */ +private[orc] object FileSystemHelper { + /** + * + * @param pathStr + * @param conf + * @return + */ + def listFiles(pathStr: String, conf: Configuration): Seq[Path] = { + val origPath = new Path(pathStr) + val fs = origPath.getFileSystem(conf) + if (fs == null) { + throw new IllegalArgumentException( + s"OrcTableOperations: Path $origPath is incorrectly formatted") + } + val path = origPath.makeQualified(fs) + if (!fs.exists(path) || !fs.getFileStatus(path).isDir) { + Seq.empty + } else { + fs.listStatus(path).map(_.getPath) + } + } + + /** + * + * @param origPath + * @param conf + * @param extension + * @return + */ + def listFiles(origPath: Path, conf: Configuration, extension: String): Seq[Path] = { + val fs = origPath.getFileSystem(conf) + if (fs == null) { + throw new IllegalArgumentException( + s"OrcTableOperations: Path $origPath is incorrectly formatted") + } + val path = origPath.makeQualified(fs) + if (fs.exists(path) && fs.getFileStatus(path).isDir) { + fs.listStatus(path).map(_.getPath).filter(p => p.getName.endsWith(extension)) + } else { + Seq.empty + } + } + + /** + * Finds the maximum taskid in the output file names at the given path. 
+ */ + def findMaxTaskId(pathStr: String, conf: Configuration): Int = { + val files = FileSystemHelper.listFiles(pathStr, conf) + // filename pattern is part-r-.orc + val nameP = new scala.util.matching.Regex( """part-r-(\d{1,}).orc""", "taskid") + val hiddenFileP = new scala.util.matching.Regex("_.*") + files.map(_.getName).map { + case nameP(taskid) => taskid.toInt + case hiddenFileP() => 0 + case other: String => { + sys.error("ERROR: attempting to append to set of Orc files and found file" + + s"that does not match name pattern: $other") + 0 + } + case _ => 0 + }.reduceLeft((a, b) => if (a < b) b else a) + } +} + From ec3cdaf6fbee283107e409356d115757fbfc1ff4 Mon Sep 17 00:00:00 2001 From: scwf Date: Mon, 29 Sep 2014 21:45:55 +0800 Subject: [PATCH 02/32] add unit tests --- .../apache/spark/sql/orc/ORCQuerySuite.scala | 184 ++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/orc/ORCQuerySuite.scala diff --git a/sql/core/src/test/scala/org/apache/spark/sql/orc/ORCQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/orc/ORCQuerySuite.scala new file mode 100644 index 0000000000000..6daf3b5855f4d --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/orc/ORCQuerySuite.scala @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.orc + +import org.apache.hadoop.fs.Path +import org.apache.hadoop.hive.ql.io.orc.CompressionKind +import org.apache.spark.sql.{SQLConf, SchemaRDD, TestData, QueryTest} +import org.apache.spark.sql.test.TestSQLContext +import org.scalatest.{BeforeAndAfterAll, FunSuiteLike} +import org.apache.spark.util.Utils +import org.apache.spark.sql.catalyst.util.getTempFilePath +import org.apache.spark.sql.test.TestSQLContext._ + +import java.io.File + +case class TestRDDEntry(key: Int, value: String) + +case class NullReflectData( + intField: java.lang.Integer, + longField: java.lang.Long, + floatField: java.lang.Float, + doubleField: java.lang.Double, + booleanField: java.lang.Boolean) + +case class OptionalReflectData( + intField: Option[Int], + longField: Option[Long], + floatField: Option[Float], + doubleField: Option[Double], + booleanField: Option[Boolean]) + +case class Nested(i: Int, s: String) + +case class Data(array: Seq[Int], nested: Nested) + +case class AllDataTypes( + stringField: String, + intField: Int, + longField: Long, + floatField: Float, + doubleField: Double, + shortField: Short, + byteField: Byte, + booleanField: Boolean) + +case class AllDataTypesWithNonPrimitiveType( + stringField: String, + intField: Int, + longField: Long, + floatField: Float, + doubleField: Double, + shortField: Short, + byteField: Byte, + booleanField: Boolean, + array: Seq[Int], + arrayContainsNull: Seq[Option[Int]], + map: Map[Int, Long], + mapValueContainsNull: Map[Int, Option[Long]], + data: Data) + +case class BinaryData(binaryData: Array[Byte]) + +class OrcQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterAll { + TestData // Load test data tables. + + var testRDD: SchemaRDD = null + test("Read/Write All Types") { + val tempDir = getTempFilePath("orcTest").getCanonicalPath + val range = (0 to 255) + val data = sparkContext.parallelize(range) + .map(x => AllDataTypes(s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0)) + + data.saveAsOrcFile(tempDir) + + checkAnswer( + orcFile(tempDir), + data.toSchemaRDD.collect().toSeq) + + Utils.deleteRecursively(new File(tempDir)) + + } + + test("Compression options for writing to a Orcfile") { + val defaultOrcCompressionCodec = TestSQLContext.orcCompressionCodec + //TODO: support other compress codec + val file = getTempFilePath("orcTest") + val path = file.toString + val rdd = TestSQLContext.sparkContext.parallelize((1 to 100)) + .map(i => TestRDDEntry(i, s"val_$i")) + + // test default compression codec, now only support zlib + rdd.saveAsOrcFile(path) + var actualCodec = OrcFileOperator.readMetaData(new Path(path)).getCompression.name + assert(actualCodec == TestSQLContext.orcCompressionCodec.toUpperCase) + + /** + orcFile(path).registerTempTable("tmp") + checkAnswer( + sql("SELECT key, value FROM tmp WHERE value = 'val_5' OR value = 'val_7'"), + (5, "val_5") :: + (7, "val_7") :: Nil) + + Utils.deleteRecursively(file) + + // test uncompressed parquet file with property value "UNCOMPRESSED" + TestSQLContext.setConf(SQLConf.ORC_COMPRESSION, "UNCOMPRESSED") + + rdd.saveAsOrcFile(path) + actualCodec = OrcFileOperator.readMetaData(new Path(path)).getCompression.name + assert(actualCodec == OrcRelation.shortOrcCompressionCodecNames.getOrElse( + TestSQLContext.orcCompressionCodec.toUpperCase, CompressionKind.NONE).name.toUpperCase) + + orcFile(path).registerTempTable("tmp") + checkAnswer( + sql("SELECT key, value FROM tmp WHERE value = 'val_5' OR value = 'val_7'"), + (5, "val_5") :: + (7, 
"val_7") :: Nil) + + Utils.deleteRecursively(file) + + // test uncompressed parquet file with property value "none" + TestSQLContext.setConf(SQLConf.ORC_COMPRESSION, "none") + + rdd.saveAsOrcFile(path) + actualCodec = OrcFileOperator.readMetaData(new Path(path)).getCompression.name() + assert(actualCodec ==TestSQLContext.orcCompressionCodec.toUpperCase) + + orcFile(path).registerTempTable("tmp") + checkAnswer( + sql("SELECT key, value FROM tmp WHERE value = 'val_5' OR value = 'val_7'"), + (5, "val_5") :: + (7, "val_7") :: Nil) + + Utils.deleteRecursively(file) + + // test gzip compression codec + TestSQLContext.setConf(SQLConf.ORC_COMPRESSION, "zlib") + + rdd.saveAsOrcFile(path) + actualCodec = OrcFileOperator.readMetaData(new Path(path)).getCompression.name() + assert(actualCodec ==TestSQLContext.orcCompressionCodec.toUpperCase) + + orcFile(path).registerTempTable("tmp") + checkAnswer( + sql("SELECT key, value FROM tmp WHERE value = 'val_5' OR value = 'val_7'"), + (5, "val_5") :: + (7, "val_7") :: Nil) + + Utils.deleteRecursively(file) + + // test snappy compression codec + TestSQLContext.setConf(SQLConf.ORC_COMPRESSION, "snappy") + + rdd.saveAsOrcFile(path) + actualCodec = OrcFileOperator.readMetaData(new Path(path)).getCompression.name() + assert(actualCodec ==TestSQLContext.orcCompressionCodec.toUpperCase) + + orcFile(path).registerTempTable("tmp") + checkAnswer( + sql("SELECT key, value FROM tmp WHERE value = 'val_5' OR value = 'val_7'"), + (5, "val_5") :: + (7, "val_7") :: Nil) + + Utils.deleteRecursively(file) + + TestSQLContext.setConf(SQLConf.ORC_COMPRESSION, defaultOrcCompressionCodec) + */ + } +} From 71262909ac01f178c55d8db113b2c6a12fdd1654 Mon Sep 17 00:00:00 2001 From: scwf Date: Mon, 29 Sep 2014 22:25:42 +0800 Subject: [PATCH 03/32] add orc to example of spark sql --- .../org/apache/spark/examples/sql/RDDRelation.scala | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala index d56d64c564200..7610d36c698e7 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala @@ -59,14 +59,25 @@ object RDDRelation { // Write out an RDD as a parquet file. rdd.saveAsParquetFile("pair.parquet") + // Write out an RDD as a orc file. + rdd.saveAsOrcFile("pair.orc") + + // Read in parquet file. Parquet files are self-describing so the schmema is preserved. val parquetFile = sqlContext.parquetFile("pair.parquet") + // Read in orc file. orc files are self-describing so the schmema is preserved. + val orcFile = sqlContext.orcFile("pair.orc") + + // Queries can be run using the DSL on parequet files just like the original RDD. parquetFile.where('key === 1).select('value as 'a).collect().foreach(println) // These files can also be registered as tables. 
parquetFile.registerTempTable("parquetFile") sql("SELECT * FROM parquetFile").collect().foreach(println) + + orcFile.registerTempTable("orcFile") + sql("SELECT * FROM orcFile").collect().foreach(println) } } From 1505af48becdca9f17d84d9c7a0ef7f03dbc4e8a Mon Sep 17 00:00:00 2001 From: scwf Date: Sat, 4 Oct 2014 15:34:27 +0800 Subject: [PATCH 04/32] fix according comments and move orc to hive sub project --- .../spark/examples/sql/RDDRelation.scala | 11 -- .../examples/sql/hive/HiveFromSpark.scala | 10 + sql/core/pom.xml | 15 -- .../scala/org/apache/spark/sql/SQLConf.scala | 7 - .../org/apache/spark/sql/SQLContext.scala | 46 +---- .../org/apache/spark/sql/SchemaRDDLike.scala | 4 +- .../spark/sql/execution/SparkStrategies.scala | 22 --- .../sql/parquet/ParquetTableOperations.scala | 40 +++- .../apache/spark/sql/orc/ORCQuerySuite.scala | 184 ------------------ .../apache/spark/sql/hive/HiveContext.scala | 50 ++++- .../spark/sql/hive/HiveStrategies.scala | 21 ++ .../spark/sql/hive}/orc/OrcRelation.scala | 93 ++++----- .../sql/hive}/orc/OrcTableOperations.scala | 166 +++++----------- .../spark/sql/hive/orc/ORCQuerySuite.scala | 74 +++++++ 14 files changed, 274 insertions(+), 469 deletions(-) delete mode 100644 sql/core/src/test/scala/org/apache/spark/sql/orc/ORCQuerySuite.scala rename sql/{core/src/main/scala/org/apache/spark/sql => hive/src/main/scala/org/apache/spark/sql/hive}/orc/OrcRelation.scala (79%) rename sql/{core/src/main/scala/org/apache/spark/sql => hive/src/main/scala/org/apache/spark/sql/hive}/orc/OrcTableOperations.scala (71%) create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala index c882d40c424e5..2e98b2dc30b80 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala @@ -59,17 +59,9 @@ object RDDRelation { // Write out an RDD as a parquet file. rdd.saveAsParquetFile("pair.parquet") - // Write out an RDD as a orc file. - rdd.saveAsOrcFile("pair.orc") - - // Read in parquet file. Parquet files are self-describing so the schmema is preserved. val parquetFile = sqlContext.parquetFile("pair.parquet") - // Read in orc file. orc files are self-describing so the schmema is preserved. - val orcFile = sqlContext.orcFile("pair.orc") - - // Queries can be run using the DSL on parequet files just like the original RDD. parquetFile.where('key === 1).select('value as 'a).collect().foreach(println) @@ -77,9 +69,6 @@ object RDDRelation { parquetFile.registerTempTable("parquetFile") sql("SELECT * FROM parquetFile").collect().foreach(println) - orcFile.registerTempTable("orcFile") - sql("SELECT * FROM orcFile").collect().foreach(println) - sc.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala b/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala index e26f213e8afa8..ea958eddb8f8c 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala @@ -62,6 +62,16 @@ object HiveFromSpark { println("Result of SELECT *:") sql("SELECT * FROM records r JOIN src s ON r.key = s.key").collect().foreach(println) + // Write out an RDD as a orc file. + rdd.saveAsOrcFile("pair.orc") + + // Read in orc file. 
Orc files are self-describing so the schmema is preserved. + val orcFile = hiveContext.orcFile("pair.orc") + + // These files can also be registered as tables. + orcFile.registerTempTable("orcFile") + sql("SELECT * FROM records r JOIN orcFile s ON r.key = s.key").collect().foreach(println) + sc.stop() } } diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 724d632c480b8..bd110218d34f7 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -53,21 +53,6 @@ test-jar test - - org.spark-project.hive - hive-exec - ${hive.version} - - - commons-logging - commons-logging - - - org.slf4j - slf4j-api - - - com.twitter parquet-column diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index 1fca2f380e7f1..f6f4cf3b80d41 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -35,7 +35,6 @@ private[spark] object SQLConf { val PARQUET_BINARY_AS_STRING = "spark.sql.parquet.binaryAsString" val PARQUET_CACHE_METADATA = "spark.sql.parquet.cacheMetadata" val PARQUET_COMPRESSION = "spark.sql.parquet.compression.codec" - val ORC_COMPRESSION = "spark.sql.orc.compression.codec" // This is only used for the thriftserver val THRIFTSERVER_POOL = "spark.sql.thriftserver.scheduler.pool" @@ -84,12 +83,6 @@ private[sql] trait SQLConf { /** The compression codec for writing to a Parquetfile */ private[spark] def parquetCompressionCodec: String = getConf(PARQUET_COMPRESSION, "snappy") - /** The compression codec for writing to a Orcfile - * Note: only support zlib now since we use ```OrcOutputFormat.getRecordWriter``` ,which is not - * allowed to configure thr compression kind - */ - private[spark] def orcCompressionCodec: String = getConf(ORC_COMPRESSION, "zlib") - /** The number of rows that will be */ private[spark] def columnBatchSize: Int = getConf(COLUMN_BATCH_SIZE, "1000").toInt diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 7bf92b92d6b9e..10331f8c7c854 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -36,8 +36,7 @@ import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.SparkStrategies import org.apache.spark.sql.json._ import org.apache.spark.sql.parquet.ParquetRelation -import org.apache.spark.{Logging, SparkContext} -import org.apache.spark.sql.orc.OrcRelation +import org.apache.spark.SparkContext /** * :: AlphaComponent :: @@ -148,14 +147,6 @@ class SQLContext(@transient val sparkContext: SparkContext) def parquetFile(path: String): SchemaRDD = new SchemaRDD(this, parquet.ParquetRelation(path, Some(sparkContext.hadoopConfiguration), this)) - /** - * Loads a Orc file, returning the result as a [[SchemaRDD]]. - * - * @group userf - */ - def orcFile(path: String): SchemaRDD = - new SchemaRDD(this, orc.OrcRelation(path, Some(sparkContext.hadoopConfiguration), this)) - /** * Loads a JSON file (one object per line), returning the result as a [[SchemaRDD]]. * It goes through the entire dataset once to determine the schema. @@ -255,40 +246,6 @@ class SQLContext(@transient val sparkContext: SparkContext) path, ScalaReflection.attributesFor[A], allowExisting, conf, this)) } - /** - * :: Experimental :: - * Creates an empty parquet file with the schema of class `A`, which can be registered as a table. 
- * This registered table can be used as the target of future `insertInto` operations. - * - * {{{ - * val sqlContext = new SQLContext(...) - * import sqlContext._ - * - * case class Person(name: String, age: Int) - * createOrcFile[Person]("path/to/file.orc").registerTempTable("people") - * sql("INSERT INTO people SELECT 'michael', 29") - * }}} - * - * @tparam A A case class type that describes the desired schema of the parquet file to be - * created. - * @param path The path where the directory containing parquet metadata should be created. - * Data inserted into this table will also be stored at this location. - * @param allowExisting When false, an exception will be thrown if this directory already exists. - * @param conf A Hadoop configuration object that can be used to specify options to the parquet - * output format. - * - * @group userf - */ - @Experimental - def createOrcFile[A <: Product : TypeTag]( - path: String, - allowExisting: Boolean = true, - conf: Configuration = new Configuration()): SchemaRDD = { - new SchemaRDD( - this, - OrcRelation.createEmpty(path, ScalaReflection.attributesFor[A], allowExisting, conf, this)) - } - /** * Registers the given RDD as a temporary table in the catalog. Temporary tables exist only * during the lifetime of this instance of SQLContext. @@ -334,7 +291,6 @@ class SQLContext(@transient val sparkContext: SparkContext) HashJoin :: InMemoryScans :: ParquetOperations :: - OrcOperations:: BasicOperators :: CartesianProduct :: BroadcastNestedLoopJoin :: Nil diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala index a42ec3d3ea35d..248b7c0d5af2a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala @@ -18,9 +18,8 @@ package org.apache.spark.sql import org.apache.spark.annotation.{DeveloperApi, Experimental} -import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.execution.SparkLogicalPlan import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation +import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.LogicalRDD /** @@ -81,6 +80,7 @@ private[sql] trait SchemaRDDLike { * Saves the contents of this `SchemaRDD` as a orc file, preserving the schema. Files that * are written out using this method can be read back in as a SchemaRDD using the `orcFile` * function. 
+ * Note: you can only use it in HiveContext * * @group schema */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index d04c908c16d2c..cf93d5ad7b503 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -26,7 +26,6 @@ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.catalyst.types._ import org.apache.spark.sql.columnar.{InMemoryRelation, InMemoryColumnarTableScan} import org.apache.spark.sql.parquet._ -import org.apache.spark.sql.orc.{OrcTableScan, InsertIntoOrcTable, OrcRelation} private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { self: SQLContext#SparkPlanner => @@ -238,27 +237,6 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { } } - object OrcOperations extends Strategy { - def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { - case logical.WriteToOrcFile(path, child) => - val relation = - OrcRelation.create(path, child, sparkContext.hadoopConfiguration, sqlContext) - InsertIntoOrcTable(relation, planLater(child), overwrite=true) :: Nil - case logical.InsertIntoOrcTable(table: OrcRelation, partition, child, overwrite) => - InsertIntoOrcTable(table, planLater(child), overwrite) :: Nil - case PhysicalOperation(projectList, filters, relation: OrcRelation) => - // TODO: need to implement predict push down. - val prunePushedDownFilters = identity[Seq[Expression]] _ - pruneFilterProject( - projectList, - filters, - prunePushedDownFilters, - OrcTableScan(_, relation, None)) :: Nil - - case _ => Nil - } - } - object InMemoryScans extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case PhysicalOperation(projectList, filters, mem: InMemoryRelation) => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala index ffb732347d30a..eed212cc0819e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala @@ -283,7 +283,7 @@ case class InsertIntoParquetTable( 1 } else { FileSystemHelper - .findMaxTaskId(NewFileOutputFormat.getOutputPath(job).toString, job.getConfiguration) + 1 + .findMaxTaskId(NewFileOutputFormat.getOutputPath(job).toString, job.getConfiguration, "parquet") + 1 } def writeShard(context: TaskContext, iter: Iterator[Row]): Int = { @@ -488,7 +488,7 @@ private[parquet] object FilteringParquetRowInputFormat { .build[FileStatus, Array[BlockLocation]]() } -private[parquet] object FileSystemHelper { +private[sql] object FileSystemHelper { def listFiles(pathStr: String, conf: Configuration): Seq[Path] = { val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) @@ -504,19 +504,41 @@ private[parquet] object FileSystemHelper { fs.listStatus(path).map(_.getPath) } - /** - * Finds the maximum taskid in the output file names at the given path. 
- */ - def findMaxTaskId(pathStr: String, conf: Configuration): Int = { + /** + * List files with special extension + */ + def listFiles(origPath: Path, conf: Configuration, extension: String): Seq[Path] = { + val fs = origPath.getFileSystem(conf) + if (fs == null) { + throw new IllegalArgumentException( + s"OrcTableOperations: Path $origPath is incorrectly formatted") + } + val path = origPath.makeQualified(fs) + if (fs.exists(path) && fs.getFileStatus(path).isDir) { + fs.listStatus(path).map(_.getPath).filter(p => p.getName.endsWith(extension)) + } else { + Seq.empty + } + } + + /** + * Finds the maximum taskid in the output file names at the given path. + */ + def findMaxTaskId(pathStr: String, conf: Configuration, extension: String): Int = { val files = FileSystemHelper.listFiles(pathStr, conf) - // filename pattern is part-r-.parquet - val nameP = new scala.util.matching.Regex("""part-r-(\d{1,}).parquet""", "taskid") + // filename pattern is part-r-.$extension + val nameP = extension match { + case "parquet" => new scala.util.matching.Regex( """part-r-(\d{1,}).parquet""", "taskid") + case "orc" => new scala.util.matching.Regex( """part-r-(\d{1,}).orc""", "taskid") + case _ => + sys.error(s"ERROR: unsupported extension: $extension") + } val hiddenFileP = new scala.util.matching.Regex("_.*") files.map(_.getName).map { case nameP(taskid) => taskid.toInt case hiddenFileP() => 0 case other: String => { - sys.error("ERROR: attempting to append to set of Parquet files and found file" + + sys.error(s"ERROR: attempting to append to set of $extension files and found file" + s"that does not match name pattern: $other") 0 } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/orc/ORCQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/orc/ORCQuerySuite.scala deleted file mode 100644 index 6daf3b5855f4d..0000000000000 --- a/sql/core/src/test/scala/org/apache/spark/sql/orc/ORCQuerySuite.scala +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.orc - -import org.apache.hadoop.fs.Path -import org.apache.hadoop.hive.ql.io.orc.CompressionKind -import org.apache.spark.sql.{SQLConf, SchemaRDD, TestData, QueryTest} -import org.apache.spark.sql.test.TestSQLContext -import org.scalatest.{BeforeAndAfterAll, FunSuiteLike} -import org.apache.spark.util.Utils -import org.apache.spark.sql.catalyst.util.getTempFilePath -import org.apache.spark.sql.test.TestSQLContext._ - -import java.io.File - -case class TestRDDEntry(key: Int, value: String) - -case class NullReflectData( - intField: java.lang.Integer, - longField: java.lang.Long, - floatField: java.lang.Float, - doubleField: java.lang.Double, - booleanField: java.lang.Boolean) - -case class OptionalReflectData( - intField: Option[Int], - longField: Option[Long], - floatField: Option[Float], - doubleField: Option[Double], - booleanField: Option[Boolean]) - -case class Nested(i: Int, s: String) - -case class Data(array: Seq[Int], nested: Nested) - -case class AllDataTypes( - stringField: String, - intField: Int, - longField: Long, - floatField: Float, - doubleField: Double, - shortField: Short, - byteField: Byte, - booleanField: Boolean) - -case class AllDataTypesWithNonPrimitiveType( - stringField: String, - intField: Int, - longField: Long, - floatField: Float, - doubleField: Double, - shortField: Short, - byteField: Byte, - booleanField: Boolean, - array: Seq[Int], - arrayContainsNull: Seq[Option[Int]], - map: Map[Int, Long], - mapValueContainsNull: Map[Int, Option[Long]], - data: Data) - -case class BinaryData(binaryData: Array[Byte]) - -class OrcQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterAll { - TestData // Load test data tables. - - var testRDD: SchemaRDD = null - test("Read/Write All Types") { - val tempDir = getTempFilePath("orcTest").getCanonicalPath - val range = (0 to 255) - val data = sparkContext.parallelize(range) - .map(x => AllDataTypes(s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0)) - - data.saveAsOrcFile(tempDir) - - checkAnswer( - orcFile(tempDir), - data.toSchemaRDD.collect().toSeq) - - Utils.deleteRecursively(new File(tempDir)) - - } - - test("Compression options for writing to a Orcfile") { - val defaultOrcCompressionCodec = TestSQLContext.orcCompressionCodec - //TODO: support other compress codec - val file = getTempFilePath("orcTest") - val path = file.toString - val rdd = TestSQLContext.sparkContext.parallelize((1 to 100)) - .map(i => TestRDDEntry(i, s"val_$i")) - - // test default compression codec, now only support zlib - rdd.saveAsOrcFile(path) - var actualCodec = OrcFileOperator.readMetaData(new Path(path)).getCompression.name - assert(actualCodec == TestSQLContext.orcCompressionCodec.toUpperCase) - - /** - orcFile(path).registerTempTable("tmp") - checkAnswer( - sql("SELECT key, value FROM tmp WHERE value = 'val_5' OR value = 'val_7'"), - (5, "val_5") :: - (7, "val_7") :: Nil) - - Utils.deleteRecursively(file) - - // test uncompressed parquet file with property value "UNCOMPRESSED" - TestSQLContext.setConf(SQLConf.ORC_COMPRESSION, "UNCOMPRESSED") - - rdd.saveAsOrcFile(path) - actualCodec = OrcFileOperator.readMetaData(new Path(path)).getCompression.name - assert(actualCodec == OrcRelation.shortOrcCompressionCodecNames.getOrElse( - TestSQLContext.orcCompressionCodec.toUpperCase, CompressionKind.NONE).name.toUpperCase) - - orcFile(path).registerTempTable("tmp") - checkAnswer( - sql("SELECT key, value FROM tmp WHERE value = 'val_5' OR value = 'val_7'"), - (5, "val_5") :: - (7, 
"val_7") :: Nil) - - Utils.deleteRecursively(file) - - // test uncompressed parquet file with property value "none" - TestSQLContext.setConf(SQLConf.ORC_COMPRESSION, "none") - - rdd.saveAsOrcFile(path) - actualCodec = OrcFileOperator.readMetaData(new Path(path)).getCompression.name() - assert(actualCodec ==TestSQLContext.orcCompressionCodec.toUpperCase) - - orcFile(path).registerTempTable("tmp") - checkAnswer( - sql("SELECT key, value FROM tmp WHERE value = 'val_5' OR value = 'val_7'"), - (5, "val_5") :: - (7, "val_7") :: Nil) - - Utils.deleteRecursively(file) - - // test gzip compression codec - TestSQLContext.setConf(SQLConf.ORC_COMPRESSION, "zlib") - - rdd.saveAsOrcFile(path) - actualCodec = OrcFileOperator.readMetaData(new Path(path)).getCompression.name() - assert(actualCodec ==TestSQLContext.orcCompressionCodec.toUpperCase) - - orcFile(path).registerTempTable("tmp") - checkAnswer( - sql("SELECT key, value FROM tmp WHERE value = 'val_5' OR value = 'val_7'"), - (5, "val_5") :: - (7, "val_7") :: Nil) - - Utils.deleteRecursively(file) - - // test snappy compression codec - TestSQLContext.setConf(SQLConf.ORC_COMPRESSION, "snappy") - - rdd.saveAsOrcFile(path) - actualCodec = OrcFileOperator.readMetaData(new Path(path)).getCompression.name() - assert(actualCodec ==TestSQLContext.orcCompressionCodec.toUpperCase) - - orcFile(path).registerTempTable("tmp") - checkAnswer( - sql("SELECT key, value FROM tmp WHERE value = 'val_5' OR value = 'val_7'"), - (5, "val_5") :: - (7, "val_7") :: Nil) - - Utils.deleteRecursively(file) - - TestSQLContext.setConf(SQLConf.ORC_COMPRESSION, defaultOrcCompressionCodec) - */ - } -} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index 8bcc098bbb620..b959070e4764c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -25,6 +25,7 @@ import scala.collection.JavaConversions._ import scala.language.implicitConversions import scala.reflect.runtime.universe.{TypeTag, typeTag} +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.conf.HiveConf @@ -37,14 +38,14 @@ import org.apache.hadoop.hive.serde2.io.TimestampWritable import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD +import org.apache.spark.annotation.Experimental +import org.apache.spark.sql.hive.orc.OrcRelation import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.analysis.{Analyzer, EliminateAnalysisOperators} import org.apache.spark.sql.catalyst.analysis.{OverrideCatalog, OverrideFunctionRegistry} import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.execution.ExtractPythonUdfs -import org.apache.spark.sql.execution.QueryExecutionException -import org.apache.spark.sql.execution.{Command => PhysicalCommand} +import org.apache.spark.sql.execution.{Command => PhysicalCommand, ExtractPythonUdfs, QueryExecutionException} import org.apache.spark.sql.hive.execution.DescribeHiveTableCommand /** @@ -120,6 +121,48 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { catalog.createTable("default", tableName, ScalaReflection.attributesFor[A], allowExisting) } + /** + * Loads a Orc file, returning the result as a [[SchemaRDD]]. 
+ * + * @group userf + */ + def orcFile(path: String): SchemaRDD = + new SchemaRDD(this, orc.OrcRelation(path, Some(sparkContext.hadoopConfiguration), this)) + + /** + * :: Experimental :: + * Creates an empty orc file with the schema of class `A`, which can be registered as a table. + * This registered table can be used as the target of future `insertInto` operations. + * + * {{{ + * val sqlContext = new SQLContext(...) + * import sqlContext._ + * + * case class Person(name: String, age: Int) + * createOrcFile[Person]("path/to/file.orc").registerTempTable("people") + * sql("INSERT INTO people SELECT 'michael', 29") + * }}} + * + * @tparam A A case class type that describes the desired schema of the orc file to be + * created. + * @param path The path where the directory containing parquet metadata should be created. + * Data inserted into this table will also be stored at this location. + * @param allowExisting When false, an exception will be thrown if this directory already exists. + * @param conf A Hadoop configuration object that can be used to specify options to the parquet + * output format. + * + * @group userf + */ + @Experimental + def createOrcFile[A <: Product : TypeTag]( + path: String, + allowExisting: Boolean = true, + conf: Configuration = new Configuration()): SchemaRDD = { + new SchemaRDD( + this, + OrcRelation.createEmpty(path, ScalaReflection.attributesFor[A], allowExisting, conf, this)) + } + /** * Analyzes the given table in the current database to generate statistics, which will be * used in query optimizations. @@ -333,6 +376,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { HiveCommandStrategy(self), TakeOrdered, ParquetOperations, + OrcOperations, InMemoryScans, ParquetConversion, // Must be before HiveTableScans HiveTableScans, diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index 508d8239c7628..14f63d2ecb0aa 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.types.StringType import org.apache.spark.sql.execution.{DescribeCommand, OutputFaker, SparkPlan} import org.apache.spark.sql.hive import org.apache.spark.sql.hive.execution._ +import org.apache.spark.sql.hive.orc.{InsertIntoOrcTable, OrcTableScan, OrcRelation} import org.apache.spark.sql.parquet.ParquetRelation import org.apache.spark.sql.{SQLContext, SchemaRDD} @@ -221,4 +222,24 @@ private[hive] trait HiveStrategies { case _ => Nil } } + + object OrcOperations extends Strategy { + def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case logical.WriteToOrcFile(path, child) => + val relation = + OrcRelation.create(path, child, sparkContext.hadoopConfiguration, sqlContext) + InsertIntoOrcTable(relation, planLater(child), overwrite=true) :: Nil + case logical.InsertIntoOrcTable(table: OrcRelation, partition, child, overwrite) => + InsertIntoOrcTable(table, planLater(child), overwrite) :: Nil + case PhysicalOperation(projectList, filters, relation: OrcRelation) => + // TODO: need to implement predict push down. 
+ val prunePushedDownFilters = identity[Seq[Expression]] _ + pruneFilterProject( + projectList, + filters, + prunePushedDownFilters, + OrcTableScan(_, relation, None)) :: Nil + case _ => Nil + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala similarity index 79% rename from sql/core/src/main/scala/org/apache/spark/sql/orc/OrcRelation.scala rename to sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala index 8b5156c0f3769..5b3460553aa7f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/orc/OrcRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala @@ -15,27 +15,28 @@ * limitations under the License. */ -package org.apache.spark.sql.orc +package org.apache.spark.sql.hive.orc + +import java.util.Properties +import java.io.IOException +import org.apache.hadoop.hive.ql.stats.StatsSetupConst + +import scala.collection.mutable -import org.apache.hadoop.hive.conf.HiveConf -import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, LeafNode} -import org.apache.spark.sql.catalyst.analysis.{UnresolvedException, MultiInstanceRelation} -import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.expressions.AttributeReference -import org.apache.spark.sql.catalyst.types._ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsAction import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.hadoop.hive.ql.io.orc._ -import org.apache.hadoop.mapred.{FileInputFormat => NewFileInputFormat, JobConf} import org.apache.hadoop.hive.ql.io.orc.OrcProto.Type.Kind -import org.apache.hadoop.mapreduce.Job -import parquet.hadoop.util.ContextUtil -import java.util.Properties -import java.io.IOException -import scala.collection.mutable + +import org.apache.spark.sql.parquet.FileSystemHelper import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, LeafNode} +import org.apache.spark.sql.catalyst.analysis.{UnresolvedException, MultiInstanceRelation} +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.expressions.AttributeReference +import org.apache.spark.sql.catalyst.types._ private[sql] case class OrcRelation( @@ -56,9 +57,11 @@ private[sql] case class OrcRelation( override val output = orcSchema + override lazy val statistics = Statistics(sizeInBytes = sqlContext.defaultSizeInBytes) + def orcSchema: Seq[Attribute] = { val origPath = new Path(path) - val reader = OrcFileOperator.readMetaData(origPath) + val reader = OrcFileOperator.readMetaData(origPath, conf) if (null != reader) { val inspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] @@ -98,9 +101,9 @@ private[sql] case class OrcRelation( } def convertToAttributes( - reader: Reader, - keys: java.util.List[String], - types: java.util.List[Integer]): Seq[Attribute] = { + reader: Reader, + keys: java.util.List[String], + types: java.util.List[Integer]): Seq[Attribute] = { val range = 0.until(keys.size()) range.map { i => reader.getTypes.get(types.get(i)).getKind match { @@ -132,16 +135,6 @@ private[sql] case class OrcRelation( } private[sql] object OrcRelation { - - - // The orc compression short names - val shortOrcCompressionCodecNames = Map( - "NONE" -> CompressionKind.NONE, - "UNCOMPRESSED" -> CompressionKind.NONE, - "SNAPPY" -> 
CompressionKind.SNAPPY, - "ZLIB" -> CompressionKind.ZLIB, - "LZO" -> CompressionKind.LZO) - /** * Creates a new OrcRelation and underlying Orcfile for the given LogicalPlan. Note that * this is used inside [[org.apache.spark.sql.execution.SparkStrategies]] to @@ -152,10 +145,11 @@ private[sql] object OrcRelation { * @param conf A configuration to be used. * @return An empty OrcRelation with inferred metadata. */ - def create(pathString: String, - child: LogicalPlan, - conf: Configuration, - sqlContext: SQLContext): OrcRelation = { + def create( + pathString: String, + child: LogicalPlan, + conf: Configuration, + sqlContext: SQLContext): OrcRelation = { if (!child.resolved) { throw new UnresolvedException[LogicalPlan]( child, @@ -173,19 +167,20 @@ private[sql] object OrcRelation { * @param conf A configuration to be used. * @return An empty OrcRelation. */ - def createEmpty(pathString: String, - attributes: Seq[Attribute], - allowExisting: Boolean, - conf: Configuration, - sqlContext: SQLContext): OrcRelation = { + def createEmpty( + pathString: String, + attributes: Seq[Attribute], + allowExisting: Boolean, + conf: Configuration, + sqlContext: SQLContext): OrcRelation = { val path = checkPath(pathString, allowExisting, conf) - /** set compression kind in hive 0.13.1 - * conf.set( - * HiveConf.ConfVars.OHIVE_ORC_DEFAULT_COMPRESS.varname, - * shortOrcCompressionCodecNames.getOrElse( - * sqlContext.orcCompressionCodec.toUpperCase, CompressionKind.NONE).name) - */ + /** set compression kind in hive 0.13.1 + * conf.set( + * HiveConf.ConfVars.OHIVE_ORC_DEFAULT_COMPRESS.varname, + * shortOrcCompressionCodecNames.getOrElse( + * sqlContext.orcCompressionCodec.toUpperCase, CompressionKind.NONE).name) + */ val orcRelation = new OrcRelation(path.toString, Some(conf), sqlContext) orcRelation @@ -219,20 +214,11 @@ private[sql] object OrcRelation { } private[sql] object OrcFileOperator { - final val COMPRESSION: String = "orcfiles.compression" - - /** - * - * @param origPath - * @return - */ - def readMetaData(origPath: Path): Reader = { - val job = new Job() - val conf = ContextUtil.getConfiguration(job).asInstanceOf[JobConf] + def readMetaData(origPath: Path, configuration: Option[Configuration]): Reader = { + val conf = configuration.getOrElse(new Configuration()) val fs: FileSystem = origPath.getFileSystem(conf) val orcFiles = FileSystemHelper.listFiles(origPath, conf, ".orc") if (orcFiles != Seq.empty) { -// NewFileInputFormat.setInputPaths(conf, orcFiles(0))// why set inputpath here if (fs.exists(origPath)) { OrcFile.createReader(fs, orcFiles(0)) } else { @@ -260,4 +246,3 @@ private[sql] object OrcFileOperator { } } } - diff --git a/sql/core/src/main/scala/org/apache/spark/sql/orc/OrcTableOperations.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala similarity index 71% rename from sql/core/src/main/scala/org/apache/spark/sql/orc/OrcTableOperations.scala rename to sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala index bb1f342cc9ee7..b7b6740aa060f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/orc/OrcTableOperations.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala @@ -16,40 +16,40 @@ */ -package org.apache.spark.sql.orc +package org.apache.spark.sql.hive.orc -import org.apache.spark.sql.execution.{ExistingRdd, LeafNode, UnaryNode, SparkPlan} -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.{TaskContext, SerializableWritable} -import 
org.apache.spark.rdd.RDD +import java.io.IOException +import java.text.SimpleDateFormat +import java.util.{Locale, Date} +import scala.collection.JavaConversions._ -import _root_.parquet.hadoop.util.ContextUtil import org.apache.hadoop.fs.Path import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, FileOutputCommitter} import org.apache.hadoop.mapreduce.lib.input.FileInputFormat import org.apache.hadoop.io.{Writable, NullWritable} import org.apache.hadoop.mapreduce.{TaskID, TaskAttemptContext, Job} - -import org.apache.hadoop.hive.ql.io.orc.{OrcFile, OrcSerde, OrcInputFormat, OrcOutputFormat} +import org.apache.hadoop.hive.ql.io.orc.{OrcSerde, OrcInputFormat, OrcOutputFormat} import org.apache.hadoop.hive.serde2.objectinspector._ import org.apache.hadoop.hive.serde2.ColumnProjectionUtils import org.apache.hadoop.hive.common.`type`.{HiveDecimal, HiveVarchar} - -import java.io.IOException -import java.text.SimpleDateFormat -import java.util.{Locale, Date} -import scala.collection.JavaConversions._ import org.apache.hadoop.mapred.{SparkHadoopMapRedUtil, Reporter, JobConf} +import org.apache.spark.sql.execution._ +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.parquet.FileSystemHelper +import org.apache.spark.{TaskContext, SerializableWritable} +import org.apache.spark.rdd.RDD +import org.apache.spark.util.Utils._ + /** * orc table scan operator. Imports the file that backs the given - * [[org.apache.spark.sql.orc.OrcRelation]] as a ``RDD[Row]``. + * [[org.apache.spark.sql.hive.orc.OrcRelation]] as a ``RDD[Row]``. */ case class OrcTableScan( - output: Seq[Attribute], - relation: OrcRelation, - columnPruningPred: Option[Expression]) + output: Seq[Attribute], + relation: OrcRelation, + columnPruningPred: Option[Expression]) extends LeafNode { @transient @@ -98,7 +98,7 @@ case class OrcTableScan( val sc = sqlContext.sparkContext val job = new Job(sc.hadoopConfiguration) - val conf: Configuration = ContextUtil.getConfiguration(job) + val conf: Configuration = job.getConfiguration val fileList = FileSystemHelper.listFiles(relation.path, conf) // add all paths in the directory but skip "hidden" ones such @@ -106,14 +106,13 @@ case class OrcTableScan( for (path <- fileList if !path.getName.startsWith("_")) { FileInputFormat.addInputPath(job, path) } - val serialConf = sc.broadcast(new SerializableWritable(conf)) setColumnIds(output, relation, conf) val inputClass = classOf[OrcInputFormat].asInstanceOf[ Class[_ <: org.apache.hadoop.mapred.InputFormat[Void, Row]]] val rowRdd = productToRowRdd(sc.hadoopRDD[Void, Row]( - serialConf.value.value.asInstanceOf[JobConf], inputClass, classOf[Void], classOf[Row])) + conf.asInstanceOf[JobConf], inputClass, classOf[Void], classOf[Row])) rowRdd } @@ -153,16 +152,16 @@ case class OrcTableScan( } else { val bufferedIterator = iterator.buffered bufferedIterator.map {r => - val values = getFieldValue.map(_(r)) - new GenericRow(values.map { - case n: String if n.toLowerCase == "null" => "" - case varchar: HiveVarchar => varchar.getValue - case decimal: HiveDecimal => - BigDecimal(decimal.bigDecimalValue) - case null => "" - case other => other - - }.toArray) + val values = getFieldValue.map(_(r)) + new GenericRow(values.map { + case n: String if n.toLowerCase == "null" => "" + case varchar: HiveVarchar => varchar.getValue + case decimal: HiveDecimal => + BigDecimal(decimal.bigDecimalValue) + case null => "" + case other => other + + }.toArray) } } } @@ -180,10 +179,12 @@ case class 
OrcTableScan( } /** - * - * @param relation - * @param child - * @param overwrite + * Operator that acts as a sink for queries on RDDs and can be used to + * store the output inside a directory of ORC files. This operator + * is similar to Hive's INSERT INTO TABLE operation in the sense that + * one can choose to either overwrite or append to a directory. Note + * that consecutive insertions to the same table must have compatible + * (source) schemas. */ private[sql] case class InsertIntoOrcTable( relation: OrcRelation, @@ -193,16 +194,14 @@ private[sql] case class InsertIntoOrcTable( override def output = child.output - val intputClass: Class[_] = getInputClass + val inputClass = getInputClass.getName @transient val sc = sqlContext.sparkContext - val inspector = sc.broadcast(getInspector(intputClass)) - @transient lazy val orcSerde = initFieldInfo private def getInputClass: Class[_] = { - val existRdd = child.asInstanceOf[ExistingRdd] + val existRdd = child.asInstanceOf[PhysicalRDD] val productClass = existRdd.rdd.firstParent.elementClassTag.runtimeClass logInfo("productClass is " + productClass) val clazz = productClass @@ -231,7 +230,6 @@ private[sql] case class InsertIntoOrcTable( assert(childRdd != null) val job = new Job(sqlContext.sparkContext.hadoopConfiguration) - // TODO: move that to function in object val conf = job.getConfiguration val fspath = new Path(relation.path) @@ -248,11 +246,14 @@ private[sql] case class InsertIntoOrcTable( } } - val existRdd = child.asInstanceOf[ExistingRdd] + val existRdd = child.asInstanceOf[PhysicalRDD] val parentRdd = existRdd.rdd.firstParent[Product] - val writableRdd = parentRdd.map(obj => { - orcSerde.serialize(obj, inspector.value) - }) + val writableRdd = parentRdd.mapPartitions { iter => + val objSnspector = ObjectInspectorFactory.getReflectionObjectInspector( + getContextOrSparkClassLoader.loadClass(inputClass), + ObjectInspectorFactory.ObjectInspectorOptions.JAVA) + iter.map(obj => orcSerde.serialize(obj, objSnspector)) + } saveAsHadoopFile(writableRdd, relation.rowClass, relation.path, conf) @@ -266,10 +267,10 @@ private[sql] case class InsertIntoOrcTable( // .. 
then we could use the default one and could use [[MutablePair]] // instead of ``Tuple2`` private def saveAsHadoopFile( - rdd: RDD[Writable], - rowClass: Class[_], - path: String, - @transient conf: Configuration) { + rdd: RDD[Writable], + rowClass: Class[_], + path: String, + @transient conf: Configuration) { val job = new Job(conf) val keyType = classOf[Void] job.setOutputKeyClass(keyType) @@ -287,7 +288,7 @@ private[sql] case class InsertIntoOrcTable( 1 } else { FileSystemHelper - .findMaxTaskId(FileOutputFormat.getOutputPath(job).toString, job.getConfiguration) + 1 + .findMaxTaskId(FileOutputFormat.getOutputPath(job).toString, job.getConfiguration, "orc") + 1 } def getWriter( @@ -346,73 +347,4 @@ private[sql] case class InsertIntoOrcTable( sc.runJob(rdd, writeShard _) workerAndComitter._1.commitJob(jobTaskContext) } - } - -/** - * - */ -private[orc] object FileSystemHelper { - /** - * - * @param pathStr - * @param conf - * @return - */ - def listFiles(pathStr: String, conf: Configuration): Seq[Path] = { - val origPath = new Path(pathStr) - val fs = origPath.getFileSystem(conf) - if (fs == null) { - throw new IllegalArgumentException( - s"OrcTableOperations: Path $origPath is incorrectly formatted") - } - val path = origPath.makeQualified(fs) - if (!fs.exists(path) || !fs.getFileStatus(path).isDir) { - Seq.empty - } else { - fs.listStatus(path).map(_.getPath) - } - } - - /** - * - * @param origPath - * @param conf - * @param extension - * @return - */ - def listFiles(origPath: Path, conf: Configuration, extension: String): Seq[Path] = { - val fs = origPath.getFileSystem(conf) - if (fs == null) { - throw new IllegalArgumentException( - s"OrcTableOperations: Path $origPath is incorrectly formatted") - } - val path = origPath.makeQualified(fs) - if (fs.exists(path) && fs.getFileStatus(path).isDir) { - fs.listStatus(path).map(_.getPath).filter(p => p.getName.endsWith(extension)) - } else { - Seq.empty - } - } - - /** - * Finds the maximum taskid in the output file names at the given path. - */ - def findMaxTaskId(pathStr: String, conf: Configuration): Int = { - val files = FileSystemHelper.listFiles(pathStr, conf) - // filename pattern is part-r-.orc - val nameP = new scala.util.matching.Regex( """part-r-(\d{1,}).orc""", "taskid") - val hiddenFileP = new scala.util.matching.Regex("_.*") - files.map(_.getName).map { - case nameP(taskid) => taskid.toInt - case hiddenFileP() => 0 - case other: String => { - sys.error("ERROR: attempting to append to set of Orc files and found file" + - s"that does not match name pattern: $other") - 0 - } - case _ => 0 - }.reduceLeft((a, b) => if (a < b) b else a) - } -} - diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala new file mode 100644 index 0000000000000..3636f8a3e0301 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.orc + +import org.apache.spark.{SparkConf, SparkContext} +import org.scalatest.{BeforeAndAfterAll, FunSuiteLike} +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.spark.util.Utils +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.catalyst.util.getTempFilePath +import org.apache.spark.sql.hive.test.TestHive._ + + +import java.io.File + +case class TestRDDEntry(key: Int, value: String) + +case class AllDataTypes( + stringField: String, + intField: Int, + longField: Long, + floatField: Float, + doubleField: Double, + shortField: Short, + byteField: Byte, + booleanField: Boolean) + +class OrcQuerySuite extends QueryTest with BeforeAndAfterAll { + + test("Read/Write All Types") { + val tempDir = getTempFilePath("orcTest").getCanonicalPath + val range = (0 to 255) + val data = sparkContext.parallelize(range) + .map(x => AllDataTypes(s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0)) + + data.saveAsOrcFile(tempDir) + + checkAnswer( + TestHive.orcFile(tempDir), + data.toSchemaRDD.collect().toSeq) + + Utils.deleteRecursively(new File(tempDir)) + } + + test("Compression options for writing to a Orcfile") { + val tempDir = getTempFilePath("orcTest").getCanonicalPath + val rdd = TestHive.sparkContext.parallelize((1 to 100)) + .map(i => TestRDDEntry(i, s"val_$i")) + + // test default compression codec, now only support zlib + rdd.saveAsOrcFile(tempDir) + var actualCodec = OrcFileOperator.readMetaData(new Path(tempDir), Some(new Configuration())).getCompression.name + assert(actualCodec == "ZLIB") + + Utils.deleteRecursively(new File(tempDir)) + } +} From 1db30b1d9e9b24fb5bd0933855b65f99f8ae715b Mon Sep 17 00:00:00 2001 From: scwf Date: Sat, 4 Oct 2014 15:37:50 +0800 Subject: [PATCH 05/32] fix scala style --- .../org/apache/spark/sql/parquet/ParquetTableOperations.scala | 3 ++- .../org/apache/spark/sql/hive/orc/OrcTableOperations.scala | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala index eed212cc0819e..8fe69bdc6378b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala @@ -283,7 +283,8 @@ case class InsertIntoParquetTable( 1 } else { FileSystemHelper - .findMaxTaskId(NewFileOutputFormat.getOutputPath(job).toString, job.getConfiguration, "parquet") + 1 + .findMaxTaskId( + NewFileOutputFormat.getOutputPath(job).toString, job.getConfiguration, "parquet") + 1 } def writeShard(context: TaskContext, iter: Iterator[Row]): Int = { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala index b7b6740aa060f..4edb83bdbe1ec 100644 --- 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala @@ -288,7 +288,8 @@ private[sql] case class InsertIntoOrcTable( 1 } else { FileSystemHelper - .findMaxTaskId(FileOutputFormat.getOutputPath(job).toString, job.getConfiguration, "orc") + 1 + .findMaxTaskId( + FileOutputFormat.getOutputPath(job).toString, job.getConfiguration, "orc") + 1 } def getWriter( From 9529d6893c7c5cf9991193a2c78c65b3f67c0445 Mon Sep 17 00:00:00 2001 From: w00228970 Date: Wed, 15 Oct 2014 05:52:58 +0800 Subject: [PATCH 06/32] address comments_1 --- docs/sql-programming-guide.md | 123 ++++++++++++++++++ .../examples/sql/hive/HiveFromSpark.scala | 10 -- .../spark/sql/catalyst/dsl/package.scala | 2 +- .../plans/logical/basicOperators.scala | 25 +--- .../org/apache/spark/sql/SchemaRDDLike.scala | 6 +- .../spark/sql/execution/SparkStrategies.scala | 3 +- .../sql/parquet/ParquetTableOperations.scala | 16 +-- .../apache/spark/sql/hive/HiveContext.scala | 4 +- .../spark/sql/hive/HiveStrategies.scala | 7 +- .../sql/hive/logical/hiveOperators.scala | 27 ++++ .../spark/sql/hive/orc/OrcRelation.scala | 29 ++--- .../spark/sql/hive/orc/ORCQuerySuite.scala | 2 +- 12 files changed, 179 insertions(+), 75 deletions(-) create mode 100644 sql/hive/src/main/scala/org/apache/spark/sql/hive/logical/hiveOperators.scala diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 368c3d0008b07..368acf68fa6a4 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -597,6 +597,129 @@ Configuration of Parquet can be done using the `setConf` method on SQLContext or +## ORC Files + +[ORC](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+ORC) is a columnar format that is supported in Hive, it provides a highly efficient way to store data on HDFS to speed up query performance. +Spark SQL provides support for both reading and writing ORC files that automatically preserves the schema +of the original data. + +### Loading Data Programmatically + +Using the data from the above example: + +
+<div class="codetabs"> + +<div data-lang="scala" markdown="1">
+ +{% highlight scala %} +// Use HiveContext to read or write ORC files. +val sqlContext = new HiveContext(sc) +import sqlContext._ +val people: RDD[Person] = ... // An RDD of case class objects, from the previous example. + +// The RDD is implicitly converted to a SchemaRDD by createSchemaRDD, allowing it to be stored using ORC. +people.registerTempTable("people") +people.saveAsOrcFile("people.orc") + +// Read in the ORC file created above. ORC files are self-describing so the schema is preserved. +// The result of loading an ORC file is also a SchemaRDD. +val orcFile = sqlContext.orcFile("people.orc") + +// ORC files can also be registered as tables and then used in SQL statements. +orcFile.registerTempTable("orcFile") +val teenagers = sqlContext.sql("SELECT name FROM orcFile WHERE age >= 13 AND age <= 19") +teenagers.map(t => "Name: " + t(0)).collect().foreach(println) +{% endhighlight %} +
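+ +An empty ORC table can also be created from a case class schema and then populated with SQL inserts (a sketch using the `createOrcFile` method; the path and table name below are only examples): + +{% highlight scala %} +// Reuses the Person case class from the previous example. +sqlContext.createOrcFile[Person]("people_empty.orc").registerTempTable("people_table") +sqlContext.sql("INSERT INTO people_table SELECT 'michael', 29") +{% endhighlight %}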
+</div> + +<div data-lang="java" markdown="1">
+ +{% highlight java %} +// Use JavaHiveContext to read or write ORC files. +JavaHiveContext sqlContext = new org.apache.spark.sql.hive.api.java.JavaHiveContext(sc); +JavaSchemaRDD schemaPeople = ... // The JavaSchemaRDD from the previous example. + +// JavaSchemaRDDs can be saved as ORC files, maintaining the schema information. +schemaPeople.saveAsOrcFile("people.orc"); + +// Read in the ORC file created above. ORC files are self-describing so the schema is preserved. +// The result of loading an ORC file is also a JavaSchemaRDD. +JavaSchemaRDD orcFile = sqlContext.orcFile("people.orc"); + +// ORC files can also be registered as tables and then used in SQL statements. +orcFile.registerTempTable("orcFile"); +JavaSchemaRDD teenagers = sqlContext.sql("SELECT name FROM orcFile WHERE age >= 13 AND age <= 19"); +List<String> teenagerNames = teenagers.map(new Function<Row, String>() { + public String call(Row row) { + return "Name: " + row.getString(0); + } +}).collect(); +{% endhighlight %} +
+</div> + +<div data-lang="python" markdown="1">
+ +{% highlight python %} +# Use HiveContext to read or write ORC files. +from pyspark.sql import HiveContext +sqlContext = HiveContext(sc) + +schemaPeople # The SchemaRDD from the previous example. + +# SchemaRDDs can be saved as ORC files, maintaining the schema information. +schemaPeople.saveAsOrcFile("people.orc") + +# Read in the ORC file created above. ORC files are self-describing so the schema is preserved. +# The result of loading an ORC file is also a SchemaRDD. +orcFile = sqlContext.orcFile("people.orc") + +# ORC files can also be registered as tables and then used in SQL statements. +orcFile.registerTempTable("orcFile") +teenagers = sqlContext.sql("SELECT name FROM orcFile WHERE age >= 13 AND age <= 19") +teenNames = teenagers.map(lambda p: "Name: " + p.name) +for teenName in teenNames.collect(): + print teenName +{% endhighlight %} + +
+</div> + +</div>
+ +### Configuration + +Configuration of ORC can be done using the `setConf` method on HiveContext or by running +`SET key=value` commands using SQL. + +<table class="table"> +<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr> +<tr> + <td><code>spark.sql.orc.compression.codec</code></td> + <td>zlib</td> + <td> + Sets the compression codec used when writing ORC files. Currently only <code>zlib</code> is supported. + </td> +</tr> +</table>
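+ +For example, the codec can be set programmatically or through a SQL `SET` command (a sketch, assuming the HiveContext `sqlContext` from the examples above): + +{% highlight scala %} +// Set the ORC compression codec on the context... +sqlContext.setConf("spark.sql.orc.compression.codec", "zlib") + +// ...or equivalently through SQL. +sqlContext.sql("SET spark.sql.orc.compression.codec=zlib") +{% endhighlight %}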
+ ## JSON Datasets
diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala b/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala index ea958eddb8f8c..e26f213e8afa8 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala @@ -62,16 +62,6 @@ object HiveFromSpark { println("Result of SELECT *:") sql("SELECT * FROM records r JOIN src s ON r.key = s.key").collect().foreach(println) - // Write out an RDD as a orc file. - rdd.saveAsOrcFile("pair.orc") - - // Read in orc file. Orc files are self-describing so the schmema is preserved. - val orcFile = hiveContext.orcFile("pair.orc") - - // These files can also be registered as tables. - orcFile.registerTempTable("orcFile") - sql("SELECT * FROM records r JOIN orcFile s ON r.key = s.key").collect().foreach(println) - sc.stop() } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index deb622c39faf5..5668f09650bf4 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -267,7 +267,7 @@ package object dsl { object plans { // scalastyle:ignore implicit class DslLogicalPlan(val logicalPlan: LogicalPlan) extends LogicalPlanFunctions { - def writeToFile(path: String) = WriteToFile(path, logicalPlan) + def writeToFile(path: String) = WriteToPaquetFile(path, logicalPlan) } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala index 6fd3caf75c5fd..13027cb583fd2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala @@ -114,22 +114,6 @@ case class InsertIntoTable( } } -case class InsertIntoOrcTable( - table: LogicalPlan, - partition: Map[String, Option[String]], - child: LogicalPlan, - overwrite: Boolean) - extends LogicalPlan { - // The table being inserted into is a child for the purposes of transformations. 
- def children = table :: child :: Nil - def references = Set.empty - def output = child.output - - override lazy val resolved = childrenResolved && child.output.zip(table.output).forall { - case (childAttr, tableAttr) => childAttr.dataType == tableAttr.dataType - } -} - case class CreateTableAsSelect( databaseName: Option[String], tableName: String, @@ -138,19 +122,12 @@ case class CreateTableAsSelect( override lazy val resolved = (databaseName != None && childrenResolved) } -case class WriteToFile( +case class WriteToPaquetFile( path: String, child: LogicalPlan) extends UnaryNode { override def output = child.output } -case class WriteToOrcFile( - path: String, - child: LogicalPlan) extends UnaryNode { - def references = Set.empty - def output = child.output -} - case class Sort(order: Seq[SortOrder], child: LogicalPlan) extends UnaryNode { override def output = child.output } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala index 248b7c0d5af2a..59e49e22645cb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala @@ -54,7 +54,7 @@ private[sql] trait SchemaRDDLike { @transient protected[spark] val logicalPlan: LogicalPlan = baseLogicalPlan match { // For various commands (like DDL) and queries with side effects, we force query optimization to // happen right away to let these side effects take place eagerly. - case _: Command | _: InsertIntoTable | _: CreateTableAsSelect |_: WriteToFile => + case _: Command | _: InsertIntoTable | _: CreateTableAsSelect |_: WriteToPaquetFile => LogicalRDD(queryExecution.analyzed.output, queryExecution.toRdd)(sqlContext) case _ => baseLogicalPlan @@ -73,11 +73,11 @@ private[sql] trait SchemaRDDLike { * @group schema */ def saveAsParquetFile(path: String): Unit = { - sqlContext.executePlan(WriteToFile(path, logicalPlan)).toRdd + sqlContext.executePlan(WriteToPaquetFile(path, logicalPlan)).toRdd } /** - * Saves the contents of this `SchemaRDD` as a orc file, preserving the schema. Files that + * Saves the contents of this `SchemaRDD` as a ORC file, preserving the schema. Files that * are written out using this method can be read back in as a SchemaRDD using the `orcFile` * function. * Note: you can only use it in HiveContext diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index cf93d5ad7b503..47d2b1a21621c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -195,8 +195,7 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { object ParquetOperations extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { - // TODO: need to support writing to other types of files. Unify the below code paths. 
- case logical.WriteToFile(path, child) => + case logical.WriteToPaquetFile(path, child) => val relation = ParquetRelation.create(path, child, sparkContext.hadoopConfiguration, sqlContext) // Note: overwrite=false because otherwise the metadata we just created will be deleted diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala index 8fe69bdc6378b..a973a3491e70f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala @@ -512,11 +512,11 @@ private[sql] object FileSystemHelper { val fs = origPath.getFileSystem(conf) if (fs == null) { throw new IllegalArgumentException( - s"OrcTableOperations: Path $origPath is incorrectly formatted") + s"Path $origPath is incorrectly formatted") } val path = origPath.makeQualified(fs) if (fs.exists(path) && fs.getFileStatus(path).isDir) { - fs.listStatus(path).map(_.getPath).filter(p => p.getName.endsWith(extension)) + fs.globStatus(path).map(_.getPath).filter(p => p.getName.endsWith(extension)) } else { Seq.empty } @@ -526,17 +526,13 @@ private[sql] object FileSystemHelper { * Finds the maximum taskid in the output file names at the given path. */ def findMaxTaskId(pathStr: String, conf: Configuration, extension: String): Int = { - val files = FileSystemHelper.listFiles(pathStr, conf) // filename pattern is part-r-.$extension - val nameP = extension match { - case "parquet" => new scala.util.matching.Regex( """part-r-(\d{1,}).parquet""", "taskid") - case "orc" => new scala.util.matching.Regex( """part-r-(\d{1,}).orc""", "taskid") - case _ => - sys.error(s"ERROR: unsupported extension: $extension") - } + require(Seq("orc", "parquet").contains(extension), s"Unsupported extension: $extension") + val nameP = new scala.util.matching.Regex(s"""part-r-(\d{1,}).$extension""", "taskid") + val files = FileSystemHelper.listFiles(pathStr, conf) val hiddenFileP = new scala.util.matching.Regex("_.*") files.map(_.getName).map { - case nameP(taskid) => taskid.toInt + case nameP(taskid) => taskid.toInt r case hiddenFileP() => 0 case other: String => { sys.error(s"ERROR: attempting to append to set of $extension files and found file" + diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index b959070e4764c..761b3b3075c9b 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -122,7 +122,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { } /** - * Loads a Orc file, returning the result as a [[SchemaRDD]]. + * Loads a ORC file, returning the result as a [[SchemaRDD]]. * * @group userf */ @@ -135,7 +135,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { * This registered table can be used as the target of future `insertInto` operations. * * {{{ - * val sqlContext = new SQLContext(...) + * val sqlContext = new HiveContext(...) 
* import sqlContext._ * * case class Person(name: String, age: Int) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index 14f63d2ecb0aa..a9183714e10e3 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -22,13 +22,12 @@ import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate import org.apache.spark.sql.catalyst.planning._ -import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.types.StringType import org.apache.spark.sql.execution.{DescribeCommand, OutputFaker, SparkPlan} import org.apache.spark.sql.hive import org.apache.spark.sql.hive.execution._ -import org.apache.spark.sql.hive.orc.{InsertIntoOrcTable, OrcTableScan, OrcRelation} +import org.apache.spark.sql.hive.orc.{InsertIntoOrcTable, OrcRelation, OrcTableScan} import org.apache.spark.sql.parquet.ParquetRelation import org.apache.spark.sql.{SQLContext, SchemaRDD} @@ -228,8 +227,8 @@ private[hive] trait HiveStrategies { case logical.WriteToOrcFile(path, child) => val relation = OrcRelation.create(path, child, sparkContext.hadoopConfiguration, sqlContext) - InsertIntoOrcTable(relation, planLater(child), overwrite=true) :: Nil - case logical.InsertIntoOrcTable(table: OrcRelation, partition, child, overwrite) => + InsertIntoOrcTable(relation, planLater(child), overwrite = true) :: Nil + case logical.InsertIntoTable(table: OrcRelation, partition, child, overwrite) => InsertIntoOrcTable(table, planLater(child), overwrite) :: Nil case PhysicalOperation(projectList, filters, relation: OrcRelation) => // TODO: need to implement predict push down. diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/logical/hiveOperators.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/logical/hiveOperators.scala new file mode 100644 index 0000000000000..f46d90065544b --- /dev/null +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/logical/hiveOperators.scala @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive.logical + +import org.apache.spark.sql.catalyst.plans.logical.{UnaryNode, LogicalPlan} + +case class WriteToOrcFile( + path: String, + child: LogicalPlan) extends UnaryNode { + def references = Set.empty + def output = child.output +} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala index 5b3460553aa7f..8062dfe323377 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala @@ -19,8 +19,6 @@ package org.apache.spark.sql.hive.orc import java.util.Properties import java.io.IOException -import org.apache.hadoop.hive.ql.stats.StatsSetupConst - import scala.collection.mutable import org.apache.hadoop.fs.{FileSystem, Path} @@ -29,6 +27,7 @@ import org.apache.hadoop.fs.permission.FsAction import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.hadoop.hive.ql.io.orc._ import org.apache.hadoop.hive.ql.io.orc.OrcProto.Type.Kind +import org.apache.hadoop.hive.ql.stats.StatsSetupConst import org.apache.spark.sql.parquet.FileSystemHelper import org.apache.spark.sql.SQLContext @@ -38,7 +37,6 @@ import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.types._ - private[sql] case class OrcRelation( path: String, @transient conf: Option[Configuration], @@ -59,9 +57,9 @@ private[sql] case class OrcRelation( override lazy val statistics = Statistics(sizeInBytes = sqlContext.defaultSizeInBytes) - def orcSchema: Seq[Attribute] = { + private def orcSchema: Seq[Attribute] = { val origPath = new Path(path) - val reader = OrcFileOperator.readMetaData(origPath, conf) + val reader = OrcFileOperator.getMetaDataReader(origPath, conf) if (null != reader) { val inspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] @@ -74,8 +72,8 @@ private[sql] case class OrcRelation( val totalType = reader.getTypes.get(0) val keys = totalType.getFieldNamesList val types = totalType.getSubtypesList - log.info("field name is {}", keys) - log.info("types is {}", types) + log.info("field names are {}", keys) + log.info("types are {}", types) val colBuff = new StringBuilder val typeBuff = new StringBuilder @@ -214,26 +212,21 @@ private[sql] object OrcRelation { } private[sql] object OrcFileOperator { - def readMetaData(origPath: Path, configuration: Option[Configuration]): Reader = { + def getMetaDataReader(origPath: Path, configuration: Option[Configuration]): Reader = { val conf = configuration.getOrElse(new Configuration()) val fs: FileSystem = origPath.getFileSystem(conf) val orcFiles = FileSystemHelper.listFiles(origPath, conf, ".orc") - if (orcFiles != Seq.empty) { - if (fs.exists(origPath)) { - OrcFile.createReader(fs, orcFiles(0)) - } else { - null - } + require(orcFiles != Seq.empty, "orcFiles is empty") + if (fs.exists(origPath)) { + OrcFile.createReader(fs, orcFiles(0)) } else { - null + throw new IOException(s"File not found: $origPath") } } def writeMetaData(attributes: Seq[Attribute], origPath: Path, conf: Configuration) { - if (origPath == null) { - throw new IllegalArgumentException("Unable to write Parquet metadata: path is null") - } + require(origPath != null, "Unable to write ORC metadata: path is null") val fs = origPath.getFileSystem(conf) if (fs == null) { throw new IllegalArgumentException( diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala index 3636f8a3e0301..7d8d642d22620 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala @@ -66,7 +66,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll { // test default compression codec, now only support zlib rdd.saveAsOrcFile(tempDir) - var actualCodec = OrcFileOperator.readMetaData(new Path(tempDir), Some(new Configuration())).getCompression.name + var actualCodec = OrcFileOperator.getMetaDataReader(new Path(tempDir), Some(new Configuration())).getCompression.name assert(actualCodec == "ZLIB") Utils.deleteRecursively(new File(tempDir)) From 89421ef18117c7ffdd44eff93c9360be785cacea Mon Sep 17 00:00:00 2001 From: scwf Date: Tue, 14 Oct 2014 21:15:35 -0700 Subject: [PATCH 07/32] log.info -> logInfo --- .../scala/org/apache/spark/sql/hive/orc/OrcRelation.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala index 8062dfe323377..82a55928c6a51 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala @@ -72,8 +72,8 @@ private[sql] case class OrcRelation( val totalType = reader.getTypes.get(0) val keys = totalType.getFieldNamesList val types = totalType.getSubtypesList - log.info("field names are {}", keys) - log.info("types are {}", types) + logInfo("field names are {}", keys) + logInfo("types are {}", types) val colBuff = new StringBuilder val typeBuff = new StringBuilder @@ -91,6 +91,7 @@ private[sql] case class OrcRelation( typeBuff.setLength(typeBuff.length - 1) prop.setProperty("columns", colBuff.toString()) prop.setProperty("columns.types", typeBuff.toString()) + logInfo(s"columns are ${colBuff}, columns.types are $typeBuff") val attributes = convertToAttributes(reader, keys, types) attributes } else { @@ -122,7 +123,7 @@ private[sql] case class OrcRelation( case Kind.DOUBLE => new AttributeReference(keys.get(i), DoubleType, false)() case _ => { - log.info("unsupported datatype") + logInfo("unsupported datatype") null } } From 40f9d680ed10ea7783ccd90eb9c8aa2915e3386b Mon Sep 17 00:00:00 2001 From: scwf Date: Thu, 16 Oct 2014 15:35:30 -0700 Subject: [PATCH 08/32] address comments --- .../org/apache/spark/sql/SchemaRDDLike.scala | 12 -- .../sql/parquet/ParquetTableOperations.scala | 2 +- .../spark/sql/hive/HiveStrategies.scala | 5 +- .../sql/hive/logical/hiveOperators.scala | 27 ----- .../spark/sql/hive/orc/OrcRelation.scala | 109 +++--------------- .../sql/hive/orc/OrcTableOperations.scala | 80 ++++++++++--- .../apache/spark/sql/hive/orc/package.scala | 74 ++++++++++++ 7 files changed, 155 insertions(+), 154 deletions(-) delete mode 100644 sql/hive/src/main/scala/org/apache/spark/sql/hive/logical/hiveOperators.scala create mode 100644 sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala index 59e49e22645cb..5b42011e35743 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala @@ -76,18 +76,6 
@@ private[sql] trait SchemaRDDLike { sqlContext.executePlan(WriteToPaquetFile(path, logicalPlan)).toRdd } - /** - * Saves the contents of this `SchemaRDD` as a ORC file, preserving the schema. Files that - * are written out using this method can be read back in as a SchemaRDD using the `orcFile` - * function. - * Note: you can only use it in HiveContext - * - * @group schema - */ - def saveAsOrcFile(path: String): Unit = { - sqlContext.executePlan(WriteToOrcFile(path, logicalPlan)).toRdd - } - /** * Registers this RDD as a temporary table using the given name. The lifetime of this temporary * table is tied to the [[SQLContext]] that was used to create this SchemaRDD. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala index 12a0b49f9a323..5aec62055592b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala @@ -540,7 +540,7 @@ private[sql] object FileSystemHelper { val files = FileSystemHelper.listFiles(pathStr, conf) val hiddenFileP = new scala.util.matching.Regex("_.*") files.map(_.getName).map { - case nameP(taskid) => taskid.toInt r + case nameP(taskid) => taskid.toInt case hiddenFileP() => 0 case other: String => { sys.error(s"ERROR: attempting to append to set of $extension files and found file" + diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index d2ede12523b0a..fdcd8aa58c922 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -22,12 +22,13 @@ import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate import org.apache.spark.sql.catalyst.planning._ +import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.types.StringType import org.apache.spark.sql.execution.{DescribeCommand, OutputFaker, SparkPlan} import org.apache.spark.sql.hive import org.apache.spark.sql.hive.execution._ -import org.apache.spark.sql.hive.orc.{InsertIntoOrcTable, OrcRelation, OrcTableScan} +import org.apache.spark.sql.hive.orc.{WriteToOrcFile, InsertIntoOrcTable, OrcRelation, OrcTableScan} import org.apache.spark.sql.parquet.ParquetRelation import org.apache.spark.sql.{SQLContext, SchemaRDD} @@ -224,7 +225,7 @@ private[hive] trait HiveStrategies { object OrcOperations extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { - case logical.WriteToOrcFile(path, child) => + case WriteToOrcFile(path, child) => val relation = OrcRelation.create(path, child, sparkContext.hadoopConfiguration, sqlContext) InsertIntoOrcTable(relation, planLater(child), overwrite = true) :: Nil diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/logical/hiveOperators.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/logical/hiveOperators.scala deleted file mode 100644 index f46d90065544b..0000000000000 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/logical/hiveOperators.scala +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor 
license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.hive.logical - -import org.apache.spark.sql.catalyst.plans.logical.{UnaryNode, LogicalPlan} - -case class WriteToOrcFile( - path: String, - child: LogicalPlan) extends UnaryNode { - def references = Set.empty - def output = child.output -} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala index 82a55928c6a51..c539c45efe420 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala @@ -26,16 +26,14 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsAction import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.hadoop.hive.ql.io.orc._ -import org.apache.hadoop.hive.ql.io.orc.OrcProto.Type.Kind -import org.apache.hadoop.hive.ql.stats.StatsSetupConst import org.apache.spark.sql.parquet.FileSystemHelper import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, LeafNode} import org.apache.spark.sql.catalyst.analysis.{UnresolvedException, MultiInstanceRelation} import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.types._ +import org.apache.spark.sql.hive.HiveMetastoreTypes private[sql] case class OrcRelation( path: String, @@ -49,85 +47,20 @@ private[sql] case class OrcRelation( var rowClass: Class[_] = null - val fieldIdCache: mutable.Map[String, Int] = new mutable.HashMap[String, Int] - - val fieldNameTypeCache: mutable.Map[String, String] = new mutable.HashMap[String, String] - override val output = orcSchema + // TODO: use statistics in ORC file override lazy val statistics = Statistics(sizeInBytes = sqlContext.defaultSizeInBytes) private def orcSchema: Seq[Attribute] = { + // get the schema info through ORC Reader val origPath = new Path(path) val reader = OrcFileOperator.getMetaDataReader(origPath, conf) + val inspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] + // data types that is inspected by this inspector + val schema = inspector.getTypeName - if (null != reader) { - val inspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] - val fields = inspector.getAllStructFieldRefs - - if (fields.size() == 0) { - return Seq.empty - } - - val totalType = reader.getTypes.get(0) - val keys = totalType.getFieldNamesList - val types = totalType.getSubtypesList - logInfo("field names are {}", keys) - logInfo("types are {}", types) - - val colBuff = new StringBuilder - val typeBuff = new StringBuilder - for (i <- 0 until fields.size()) { - val fieldName = fields.get(i).getFieldName - 
val typeName = fields.get(i).getFieldObjectInspector.getTypeName - colBuff.append(fieldName) - fieldNameTypeCache.put(fieldName, typeName) - fieldIdCache.put(fieldName, i) - colBuff.append(",") - typeBuff.append(typeName) - typeBuff.append(":") - } - colBuff.setLength(colBuff.length - 1) - typeBuff.setLength(typeBuff.length - 1) - prop.setProperty("columns", colBuff.toString()) - prop.setProperty("columns.types", typeBuff.toString()) - logInfo(s"columns are ${colBuff}, columns.types are $typeBuff") - val attributes = convertToAttributes(reader, keys, types) - attributes - } else { - Seq.empty - } - } - - def convertToAttributes( - reader: Reader, - keys: java.util.List[String], - types: java.util.List[Integer]): Seq[Attribute] = { - val range = 0.until(keys.size()) - range.map { - i => reader.getTypes.get(types.get(i)).getKind match { - case Kind.BOOLEAN => - new AttributeReference(keys.get(i), BooleanType, false)() - case Kind.STRING => - new AttributeReference(keys.get(i), StringType, true)() - case Kind.BYTE => - new AttributeReference(keys.get(i), ByteType, true)() - case Kind.SHORT => - new AttributeReference(keys.get(i), ShortType, true)() - case Kind.INT => - new AttributeReference(keys.get(i), IntegerType, true)() - case Kind.LONG => - new AttributeReference(keys.get(i), LongType, false)() - case Kind.FLOAT => - new AttributeReference(keys.get(i), FloatType, false)() - case Kind.DOUBLE => - new AttributeReference(keys.get(i), DoubleType, false)() - case _ => { - logInfo("unsupported datatype") - null - } - } - } + HiveMetastoreTypes.toDataType(schema).asInstanceOf[StructType].toAttributes } override def newInstance() = OrcRelation(path, conf, sqlContext).asInstanceOf[this.type] @@ -174,38 +107,26 @@ private[sql] object OrcRelation { sqlContext: SQLContext): OrcRelation = { val path = checkPath(pathString, allowExisting, conf) - /** set compression kind in hive 0.13.1 + /** TODO: set compression kind in hive 0.13.1 * conf.set( * HiveConf.ConfVars.OHIVE_ORC_DEFAULT_COMPRESS.varname, * shortOrcCompressionCodecNames.getOrElse( * sqlContext.orcCompressionCodec.toUpperCase, CompressionKind.NONE).name) */ - val orcRelation = new OrcRelation(path.toString, Some(conf), sqlContext) - - orcRelation + new OrcRelation(path.toString, Some(conf), sqlContext) } private def checkPath(pathStr: String, allowExisting: Boolean, conf: Configuration): Path = { - if (pathStr == null) { - throw new IllegalArgumentException("Unable to create OrcRelation: path is null") - } + require(pathStr != null, "Unable to create OrcRelation: path is null") val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) - if (fs == null) { - throw new IllegalArgumentException( - s"Unable to create OrcRelation: incorrectly formatted path $pathStr") - } + require(fs != null, s"Unable to create OrcRelation: incorrectly formatted path $pathStr") val path = origPath.makeQualified(fs) - if (!allowExisting && fs.exists(path)) { - sys.error(s"File $pathStr already exists.") + if (!allowExisting) { + require(!fs.exists(path), s"File $pathStr already exists.") } - - if (fs.exists(path) && - !fs.getFileStatus(path) - .getPermission - .getUserAction - .implies(FsAction.READ_WRITE)) { - throw new IOException( + if (fs.exists(path)) { + require(fs.getFileStatus(path).getPermission.getUserAction.implies(FsAction.READ_WRITE), s"Unable to create OrcRelation: path $path not read-writable") } path diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala index 4edb83bdbe1ec..0e8c616fe67ae 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala @@ -41,6 +41,20 @@ import org.apache.spark.sql.parquet.FileSystemHelper import org.apache.spark.{TaskContext, SerializableWritable} import org.apache.spark.rdd.RDD import org.apache.spark.util.Utils._ +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode => LogicalUnaryNode} +import org.apache.spark.sql.execution.UnaryNode +import org.apache.spark.sql.catalyst.types.StructType +import org.apache.spark.sql.hive.HiveMetastoreTypes +import org.apache.hadoop.hive.serde2.typeinfo.{TypeInfoUtils, TypeInfo} + +/** + * logical plan of writing to ORC file + */ +case class WriteToOrcFile( + path: String, + child: LogicalPlan) extends LogicalUnaryNode { + def output = child.output +} /** * orc table scan operator. Imports the file that backs the given @@ -107,7 +121,7 @@ case class OrcTableScan( FileInputFormat.addInputPath(job, path) } - setColumnIds(output, relation, conf) + addColumnIds(output, relation, conf) val inputClass = classOf[OrcInputFormat].asInstanceOf[ Class[_ <: org.apache.hadoop.mapred.InputFormat[Void, Row]]] @@ -117,34 +131,64 @@ case class OrcTableScan( } /** + * add column ids and names * @param output * @param relation * @param conf */ - def setColumnIds(output: Seq[Attribute], relation: OrcRelation, conf: Configuration) { - val idBuff = new StringBuilder() + def addColumnIds(output: Seq[Attribute], relation: OrcRelation, conf: Configuration) { + val fieldIdMap = relation.output.zipWithIndex.toMap - output.map(att => { + val ids = output.map(att => { val realName = att.name.toLowerCase(Locale.ENGLISH) - val id = relation.fieldIdCache.getOrElse(realName, null) - if (null != id) { - idBuff.append(id) - idBuff.append(",") + fieldIdMap.getOrElse(realName, -1) + }).filter(_ >= 0) + if (ids != null && !ids.isEmpty) { + ColumnProjectionUtils.appendReadColumnIDs(conf, ids) + } + + val names = output.map(_.name) + if (names != null && !names.isEmpty) { + ColumnProjectionUtils.appendReadColumnNames(conf, names) + } + } + + // Transform all given raw `Writable`s into `Row`s. 
+ def fillObject( + iterator: scala.collection.Iterator[Writable], + nonPartitionKeyAttrs: Seq[(Attribute, Int)], + mutableRow: MutableRow): Iterator[Row] = { + val schema = StructType.fromAttributes(relation.output) + val orcSchema = HiveMetastoreTypes.toMetastoreType(schema) + val deserializer = new OrcSerde + val typeInfo: TypeInfo = TypeInfoUtils.getTypeInfoFromTypeString(orcSchema) + val soi = relation.getObjectInspector + + val (fieldRefs, fieldOrdinals) = nonPartitionKeyAttrs.map { + case (attr, ordinal) => + soi.getStructFieldRef(attr.name) -> ordinal + }.unzip + + val unwrappers = HadoopTypeConverter.unwrappers(fieldRefs) + // Map each tuple to a row object + iterator.map { value => + val raw = deserializer.deserialize(value) + var i = 0 + while (i < fieldRefs.length) { + val fieldValue = soi.getStructFieldData(raw, fieldRefs(i)) + if (fieldValue == null) { + mutableRow.setNullAt(fieldOrdinals(i)) + } else { + unwrappers(i)(fieldValue, mutableRow, fieldOrdinals(i)) + } + i += 1 } - }) - if (idBuff.length > 0) { - idBuff.setLength(idBuff.length - 1) + mutableRow: Row } - conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, idBuff.toString()) } - /** - * - * @param data - * @tparam A - * @return - */ def productToRowRdd[A <: Product](data: RDD[A]): RDD[Row] = { + val mutableRow = new SpecificMutableRow(output.map(_.dataType)) data.mapPartitions { iterator => if (iterator.isEmpty) { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala new file mode 100644 index 0000000000000..254a1b0593904 --- /dev/null +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive + +import com.esotericsoftware.kryo.io.Output +import com.esotericsoftware.kryo.Kryo +import org.apache.commons.codec.binary.Base64 +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.hive.ql.io.sarg.SearchArgument +import org.apache.spark.sql.catalyst.ScalaReflection +import org.apache.spark.sql.SchemaRDD +import scala.reflect.runtime.universe.TypeTag + +package object orc { +// implicit class OrcContext(sqlContext: HiveContext) { +// +// def orcFile(filePath: String) = new SchemaRDD(sqlContext, +// OrcRelation(filePath, +// Some(sqlContext.sparkContext.hadoopConfiguration), sqlContext)) +// +// def createOrcFile[A <: Product : TypeTag]( +// path: String, +// allowExisting: Boolean = true, +// conf: Configuration = new Configuration()): SchemaRDD = { +// new SchemaRDD( +// sqlContext, +// OrcRelation.createEmpty(path, +// ScalaReflection.attributesFor[A], allowExisting, conf, sqlContext)) +// } +// } + + implicit class OrcSchemaRDD(rdd: SchemaRDD) { + /** + * Saves the contents of this `SchemaRDD` as a ORC file, preserving the schema. Files that + * are written out using this method can be read back in as a SchemaRDD using the `orcFile` + * function. + * Note: you can only use it in HiveContext + * + * @group schema + */ + def saveAsOrcFile(path: String): Unit = { + rdd.sqlContext.executePlan(WriteToOrcFile(path, + rdd.logicalPlan)).toRdd + } + } + + // for orc compression type, only take effect in hive 0.13.1 + val orcDefaultCompressVar = "hive.exec.orc.default.compress" + // for prediction push down in hive-0.13.1, don't enable it + val ORC_FILTER_PUSHDOWN_ENABLED = false + val SARG_PUSHDOWN = "sarg.pushdown"; + + def toKryo(input: SearchArgument) = { + val out = new Output(4 * 1024, 10 * 1024 * 1024); + new Kryo().writeObject(out, input); + out.close(); + Base64.encodeBase64String(out.toBytes()); + } +} \ No newline at end of file From 349bb92f04f16679bdf28a75f90349dfbe7d3123 Mon Sep 17 00:00:00 2001 From: scwf Date: Thu, 16 Oct 2014 17:06:24 -0700 Subject: [PATCH 09/32] reuse TableReader --- .../spark/sql/hive/orc/OrcRelation.scala | 11 +- .../sql/hive/orc/OrcTableOperations.scala | 114 +++--------------- 2 files changed, 24 insertions(+), 101 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala index c539c45efe420..36657db0db63b 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.hive.orc import java.util.Properties import java.io.IOException -import scala.collection.mutable import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.conf.Configuration @@ -35,6 +34,8 @@ import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.types._ import org.apache.spark.sql.hive.HiveMetastoreTypes +import scala.collection.JavaConversions._ + private[sql] case class OrcRelation( path: String, @transient conf: Option[Configuration], @@ -60,6 +61,14 @@ private[sql] case class OrcRelation( // data types that is inspected by this inspector val schema = inspector.getTypeName + // set prop here, initial OrcSerde need it + val fields = inspector.getAllStructFieldRefs + val (columns, columnTypes) = fields.map { f => + f.getFieldName -> f.getFieldObjectInspector.getTypeName + }.unzip + prop.setProperty("columns", 
columns.mkString(",")) + prop.setProperty("columns.types", columnTypes.mkString(":")) + HiveMetastoreTypes.toDataType(schema).asInstanceOf[StructType].toAttributes } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala index 0e8c616fe67ae..9cb7ec2e78300 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala @@ -32,7 +32,6 @@ import org.apache.hadoop.mapreduce.{TaskID, TaskAttemptContext, Job} import org.apache.hadoop.hive.ql.io.orc.{OrcSerde, OrcInputFormat, OrcOutputFormat} import org.apache.hadoop.hive.serde2.objectinspector._ import org.apache.hadoop.hive.serde2.ColumnProjectionUtils -import org.apache.hadoop.hive.common.`type`.{HiveDecimal, HiveVarchar} import org.apache.hadoop.mapred.{SparkHadoopMapRedUtil, Reporter, JobConf} import org.apache.spark.sql.execution._ @@ -43,9 +42,9 @@ import org.apache.spark.rdd.RDD import org.apache.spark.util.Utils._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode => LogicalUnaryNode} import org.apache.spark.sql.execution.UnaryNode -import org.apache.spark.sql.catalyst.types.StructType -import org.apache.spark.sql.hive.HiveMetastoreTypes -import org.apache.hadoop.hive.serde2.typeinfo.{TypeInfoUtils, TypeInfo} +import org.apache.spark.sql.hive.HadoopTableReader + +import scala.collection.JavaConversions._ /** * logical plan of writing to ORC file @@ -69,45 +68,12 @@ case class OrcTableScan( @transient lazy val serde: OrcSerde = initSerde - @transient - lazy val getFieldValue: Seq[Product => Any] = { - val inspector = serde.getObjectInspector.asInstanceOf[StructObjectInspector] - output.map(attr => { - val ref = inspector.getStructFieldRef(attr.name.toLowerCase(Locale.ENGLISH)) - row: Product => { - val fieldData = row.productElement(1) - val data = inspector.getStructFieldData(fieldData, ref) - unwrapData(data, ref.getFieldObjectInspector) - } - }) - } - private def initSerde(): OrcSerde = { val serde = new OrcSerde serde.initialize(null, relation.prop) serde } - def unwrapData(data: Any, oi: ObjectInspector): Any = oi match { - case pi: PrimitiveObjectInspector => pi.getPrimitiveJavaObject(data) - case li: ListObjectInspector => - Option(li.getList(data)) - .map(_.map(unwrapData(_, li.getListElementObjectInspector)).toSeq) - .orNull - case mi: MapObjectInspector => - Option(mi.getMap(data)).map( - _.map { - case (k, v) => - (unwrapData(k, mi.getMapKeyObjectInspector), - unwrapData(v, mi.getMapValueObjectInspector)) - }.toMap).orNull - case si: StructObjectInspector => - val allRefs = si.getAllStructFieldRefs - new GenericRow( - allRefs.map(r => - unwrapData(si.getStructFieldData(data, r), r.getFieldObjectInspector)).toArray) - } - override def execute(): RDD[Row] = { val sc = sqlContext.sparkContext val job = new Job(sc.hadoopConfiguration) @@ -123,10 +89,16 @@ case class OrcTableScan( addColumnIds(output, relation, conf) val inputClass = classOf[OrcInputFormat].asInstanceOf[ - Class[_ <: org.apache.hadoop.mapred.InputFormat[Void, Row]]] + Class[_ <: org.apache.hadoop.mapred.InputFormat[NullWritable, Writable]]] - val rowRdd = productToRowRdd(sc.hadoopRDD[Void, Row]( - conf.asInstanceOf[JobConf], inputClass, classOf[Void], classOf[Row])) + // use SpecificMutableRow to decrease GC garbage + val mutableRow = new SpecificMutableRow(output.map(_.dataType)) + val attrsWithIndex = 
output.zipWithIndex + val rowRdd = sc.hadoopRDD[NullWritable, Writable](conf.asInstanceOf[JobConf], inputClass, + classOf[NullWritable], classOf[Writable]).map(_._2).mapPartitions { iter => + val deserializer = serde + HadoopTableReader.fillObject(iter, deserializer, attrsWithIndex, mutableRow) + } rowRdd } @@ -137,14 +109,14 @@ case class OrcTableScan( * @param conf */ def addColumnIds(output: Seq[Attribute], relation: OrcRelation, conf: Configuration) { - val fieldIdMap = relation.output.zipWithIndex.toMap + val fieldIdMap = relation.output.map(_.name).zipWithIndex.toMap val ids = output.map(att => { val realName = att.name.toLowerCase(Locale.ENGLISH) fieldIdMap.getOrElse(realName, -1) }).filter(_ >= 0) if (ids != null && !ids.isEmpty) { - ColumnProjectionUtils.appendReadColumnIDs(conf, ids) + ColumnProjectionUtils.appendReadColumnIDs(conf, ids.asInstanceOf[java.util.List[Integer]]) } val names = output.map(_.name) @@ -153,64 +125,6 @@ case class OrcTableScan( } } - // Transform all given raw `Writable`s into `Row`s. - def fillObject( - iterator: scala.collection.Iterator[Writable], - nonPartitionKeyAttrs: Seq[(Attribute, Int)], - mutableRow: MutableRow): Iterator[Row] = { - val schema = StructType.fromAttributes(relation.output) - val orcSchema = HiveMetastoreTypes.toMetastoreType(schema) - val deserializer = new OrcSerde - val typeInfo: TypeInfo = TypeInfoUtils.getTypeInfoFromTypeString(orcSchema) - val soi = relation.getObjectInspector - - val (fieldRefs, fieldOrdinals) = nonPartitionKeyAttrs.map { - case (attr, ordinal) => - soi.getStructFieldRef(attr.name) -> ordinal - }.unzip - - val unwrappers = HadoopTypeConverter.unwrappers(fieldRefs) - // Map each tuple to a row object - iterator.map { value => - val raw = deserializer.deserialize(value) - var i = 0 - while (i < fieldRefs.length) { - val fieldValue = soi.getStructFieldData(raw, fieldRefs(i)) - if (fieldValue == null) { - mutableRow.setNullAt(fieldOrdinals(i)) - } else { - unwrappers(i)(fieldValue, mutableRow, fieldOrdinals(i)) - } - i += 1 - } - mutableRow: Row - } - } - - def productToRowRdd[A <: Product](data: RDD[A]): RDD[Row] = { - val mutableRow = new SpecificMutableRow(output.map(_.dataType)) - data.mapPartitions { - iterator => - if (iterator.isEmpty) { - Iterator.empty - } else { - val bufferedIterator = iterator.buffered - bufferedIterator.map {r => - val values = getFieldValue.map(_(r)) - new GenericRow(values.map { - case n: String if n.toLowerCase == "null" => "" - case varchar: HiveVarchar => varchar.getValue - case decimal: HiveDecimal => - BigDecimal(decimal.bigDecimalValue) - case null => "" - case other => other - - }.toArray) - } - } - } - } - /** * Applies a (candidate) projection. 
* From 65483e80c2082c0e68a8862d87f8ac78def9cbce Mon Sep 17 00:00:00 2001 From: scwf Date: Fri, 17 Oct 2014 02:52:10 -0700 Subject: [PATCH 10/32] fix InsertIntoOrcTable --- .../sql/hive/orc/OrcTableOperations.scala | 89 +++++++++---------- 1 file changed, 40 insertions(+), 49 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala index 9cb7ec2e78300..1f42a1387dd1e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala @@ -21,7 +21,6 @@ package org.apache.spark.sql.hive.orc import java.io.IOException import java.text.SimpleDateFormat import java.util.{Locale, Date} -import scala.collection.JavaConversions._ import org.apache.hadoop.fs.Path import org.apache.hadoop.conf.Configuration @@ -39,12 +38,14 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.parquet.FileSystemHelper import org.apache.spark.{TaskContext, SerializableWritable} import org.apache.spark.rdd.RDD -import org.apache.spark.util.Utils._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode => LogicalUnaryNode} import org.apache.spark.sql.execution.UnaryNode -import org.apache.spark.sql.hive.HadoopTableReader +import org.apache.spark.sql.hive.{HiveMetastoreTypes, HadoopTableReader} import scala.collection.JavaConversions._ +import org.apache.spark.sql.catalyst.types.StructType +import org.apache.hadoop.hive.serde2.typeinfo.{TypeInfoUtils, TypeInfo} +import org.apache.spark.sql.execution.PhysicalRDD /** * logical plan of writing to ORC file @@ -79,12 +80,12 @@ case class OrcTableScan( val job = new Job(sc.hadoopConfiguration) val conf: Configuration = job.getConfiguration - val fileList = FileSystemHelper.listFiles(relation.path, conf) - - // add all paths in the directory but skip "hidden" ones such - // as "_SUCCESS" - for (path <- fileList if !path.getName.startsWith("_")) { - FileInputFormat.addInputPath(job, path) + relation.path.split(",").foreach { curPath => + val qualifiedPath = { + val path = new Path(curPath) + path.getFileSystem(conf).makeQualified(path) + } + FileInputFormat.addInputPath(job, qualifiedPath) } addColumnIds(output, relation, conf) @@ -132,6 +133,7 @@ case class OrcTableScan( * @return Pruned TableScan. 
*/ def pruneColumns(prunedAttributes: Seq[Attribute]): OrcTableScan = { + // Todo: prune projection OrcTableScan(prunedAttributes, relation, columnPruningPred) } } @@ -152,31 +154,13 @@ private[sql] case class InsertIntoOrcTable( override def output = child.output - val inputClass = getInputClass.getName - @transient val sc = sqlContext.sparkContext - @transient lazy val orcSerde = initFieldInfo + @transient lazy val orcSerde = initSerde - private def getInputClass: Class[_] = { - val existRdd = child.asInstanceOf[PhysicalRDD] - val productClass = existRdd.rdd.firstParent.elementClassTag.runtimeClass - logInfo("productClass is " + productClass) - val clazz = productClass - if (null == relation.rowClass) { - relation.rowClass = clazz - } - clazz - } - - private def getInspector(clazz: Class[_]): ObjectInspector = { - val inspector = ObjectInspectorFactory.getReflectionObjectInspector(clazz, - ObjectInspectorFactory.ObjectInspectorOptions.JAVA) - inspector - } - - private def initFieldInfo(): OrcSerde = { + private def initSerde(): OrcSerde = { val serde: OrcSerde = new OrcSerde + serde.initialize(null, relation.prop) serde } @@ -203,30 +187,36 @@ private[sql] case class InsertIntoOrcTable( + s" to InsertIntoOrcTable:\n${e.toString}") } } - - val existRdd = child.asInstanceOf[PhysicalRDD] - val parentRdd = existRdd.rdd.firstParent[Product] - val writableRdd = parentRdd.mapPartitions { iter => - val objSnspector = ObjectInspectorFactory.getReflectionObjectInspector( - getContextOrSparkClassLoader.loadClass(inputClass), - ObjectInspectorFactory.ObjectInspectorOptions.JAVA) - iter.map(obj => orcSerde.serialize(obj, objSnspector)) + val structType = StructType.fromAttributes(relation.output) + val orcSchema = HiveMetastoreTypes.toMetastoreType(structType) + + val writableRdd = childRdd.mapPartitions { iter => + val typeInfo: TypeInfo = + TypeInfoUtils.getTypeInfoFromTypeString(orcSchema) + val standardOI = TypeInfoUtils + .getStandardJavaObjectInspectorFromTypeInfo(typeInfo) + .asInstanceOf[StructObjectInspector] + val fieldOIs = standardOI + .getAllStructFieldRefs.map(_.getFieldObjectInspector).toArray + val outputData = new Array[Any](fieldOIs.length) + iter.map { row => + var i = 0 + while (i < row.length) { + outputData(i) = HadoopTableReader.unwrapData(row(i), fieldOIs(i)) + i = 1 + } + orcSerde.serialize(outputData, standardOI) + } } - saveAsHadoopFile(writableRdd, relation.rowClass, relation.path, conf) + saveAsHadoopFile(writableRdd, relation.path, conf) // We return the child RDD to allow chaining (alternatively, one could return nothing). childRdd } - - // based on ``saveAsNewAPIHadoopFile`` in [[PairRDDFunctions]] - // TODO: Maybe PairRDDFunctions should use Product2 instead of Tuple2? - // .. then we could use the default one and could use [[MutablePair]] - // instead of ``Tuple2`` private def saveAsHadoopFile( rdd: RDD[Writable], - rowClass: Class[_], path: String, @transient conf: Configuration) { val job = new Job(conf) @@ -251,11 +241,12 @@ private[sql] case class InsertIntoOrcTable( } def getWriter( - outFormat: OrcOutputFormat, - conf: Configuration, - path: Path, - reporter: Reporter) = { + outFormat: OrcOutputFormat, + conf: Configuration, + path: Path, + reporter: Reporter) = { val fs = path.getFileSystem(conf) + outFormat.getRecordWriter(fs, conf.asInstanceOf[JobConf], path.toUri.getPath, reporter). 
asInstanceOf[org.apache.hadoop.mapred.RecordWriter[NullWritable, Writable]] } From 629f95ef0f26d2578cff1491b1d4e493243a53d4 Mon Sep 17 00:00:00 2001 From: scwf Date: Fri, 17 Oct 2014 22:15:45 -0700 Subject: [PATCH 11/32] address comments --- .../scala/org/apache/spark/util/Utils.scala | 12 ++- .../org/apache/spark/sql/SQLContext.scala | 6 +- .../sql/parquet/ParquetTableOperations.scala | 4 +- .../apache/spark/sql/hive/HiveContext.scala | 99 ++++++++++++------- .../spark/sql/hive/orc/OrcRelation.scala | 61 +++--------- .../sql/hive/orc/OrcTableOperations.scala | 8 +- .../apache/spark/sql/hive/orc/package.scala | 28 ++---- .../spark/sql/hive/orc/ORCQuerySuite.scala | 11 ++- 8 files changed, 109 insertions(+), 120 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 53a7512edd852..03a484a6cb9d5 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -659,11 +659,15 @@ private[spark] object Utils extends Logging { } private def listFilesSafely(file: File): Seq[File] = { - val files = file.listFiles() - if (files == null) { - throw new IOException("Failed to list files for dir: " + file) + if (file.exists()) { + val files = file.listFiles() + if (files == null) { + throw new IOException("Failed to list files for dir: " + file) + } + files + } else { + List() } - files } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 23e7b2d270777..0a0e351c8ac8d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -149,8 +149,10 @@ class SQLContext(@transient val sparkContext: SparkContext) * * @group userf */ - def parquetFile(path: String): SchemaRDD = - new SchemaRDD(this, parquet.ParquetRelation(path, Some(sparkContext.hadoopConfiguration), this)) + def parquetFile(path: String): SchemaRDD = { + val a = new SchemaRDD(this, parquet.ParquetRelation(path, Some(sparkContext.hadoopConfiguration), this)) + a + } /** * Loads a JSON file (one object per line), returning the result as a [[SchemaRDD]]. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala index 5aec62055592b..fb42c34c29ce9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala @@ -524,7 +524,7 @@ private[sql] object FileSystemHelper { } val path = origPath.makeQualified(fs) if (fs.exists(path) && fs.getFileStatus(path).isDir) { - fs.globStatus(path).map(_.getPath).filter(p => p.getName.endsWith(extension)) + fs.listStatus(path).map(_.getPath).filter(p => p.getName.endsWith(extension)) } else { Seq.empty } @@ -536,7 +536,7 @@ private[sql] object FileSystemHelper { def findMaxTaskId(pathStr: String, conf: Configuration, extension: String): Int = { // filename pattern is part-r-.$extension require(Seq("orc", "parquet").contains(extension), s"Unsupported extension: $extension") - val nameP = new scala.util.matching.Regex(s"""part-r-(\d{1,}).$extension""", "taskid") + val nameP = new scala.util.matching.Regex(s"""part-r-(\\d{1,}).$extension""", "taskid") val files = FileSystemHelper.listFiles(pathStr, conf) val hiddenFileP = new scala.util.matching.Regex("_.*") files.map(_.getName).map { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index 67c79bc4c8530..d1cbe60853766 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -40,14 +40,20 @@ import org.apache.hadoop.hive.serde2.io.DateWritable import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.annotation.Experimental -import org.apache.spark.sql.hive.orc.OrcRelation +import org.apache.spark.sql.hive.orc.{OrcSchemaRDD, OrcRelation} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.analysis.{Analyzer, EliminateAnalysisOperators} import org.apache.spark.sql.catalyst.analysis.{OverrideCatalog, OverrideFunctionRegistry} import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.execution.{Command => PhysicalCommand, ExtractPythonUdfs, QueryExecutionException} +import org.apache.spark.sql.execution.{Command => PhysicalCommand, _} import org.apache.spark.sql.hive.execution.DescribeHiveTableCommand +import org.apache.spark.sql.catalyst.plans.logical.SetCommand +import scala.Some +import org.apache.spark.sql.catalyst.plans.logical.NativeCommand +import org.apache.spark.sql.hive.MetastoreRelation +import org.apache.spark.sql.hive.execution.DescribeHiveTableCommand +import org.apache.spark.sql.execution.LogicalRDD /** * DEPRECATED: Use HiveContext instead. @@ -110,6 +116,25 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { @deprecated("hql() is deprecated as the sql function now parses using HiveQL by default. " + s"The SQL dialect for parsing can be set using ${SQLConf.DIALECT}", "1.1") def hql(hqlQuery: String): SchemaRDD = hiveql(hqlQuery) + /** + * Creates a SchemaRDD from an RDD of case classes. + * + * @group userf + */ +// implicit def createOrcSchemaRDD(rdd: SchemaRDD) = { +// new OrcSchemaRDD(rdd) +// } + + /** + * Creates a SchemaRDD from an RDD of case classes. 
+ * + * @group userf + */ + implicit override def createSchemaRDD[A <: Product: TypeTag](rdd: RDD[A]) = { + SparkPlan.currentContext.set(self) + new OrcSchemaRDD(this, + LogicalRDD(ScalaReflection.attributesFor[A], RDDConversions.productToRowRdd(rdd))(self)) + } /** * Creates a table using the schema of the given class. @@ -128,41 +153,41 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { * @group userf */ def orcFile(path: String): SchemaRDD = - new SchemaRDD(this, orc.OrcRelation(path, Some(sparkContext.hadoopConfiguration), this)) - - /** - * :: Experimental :: - * Creates an empty orc file with the schema of class `A`, which can be registered as a table. - * This registered table can be used as the target of future `insertInto` operations. - * - * {{{ - * val sqlContext = new HiveContext(...) - * import sqlContext._ - * - * case class Person(name: String, age: Int) - * createOrcFile[Person]("path/to/file.orc").registerTempTable("people") - * sql("INSERT INTO people SELECT 'michael', 29") - * }}} - * - * @tparam A A case class type that describes the desired schema of the orc file to be - * created. - * @param path The path where the directory containing parquet metadata should be created. - * Data inserted into this table will also be stored at this location. - * @param allowExisting When false, an exception will be thrown if this directory already exists. - * @param conf A Hadoop configuration object that can be used to specify options to the parquet - * output format. - * - * @group userf - */ - @Experimental - def createOrcFile[A <: Product : TypeTag]( - path: String, - allowExisting: Boolean = true, - conf: Configuration = new Configuration()): SchemaRDD = { - new SchemaRDD( - this, - OrcRelation.createEmpty(path, ScalaReflection.attributesFor[A], allowExisting, conf, this)) - } + new SchemaRDD(this, orc.OrcRelation(Seq.empty, path, Some(sparkContext.hadoopConfiguration), this)) + +// /** +// * :: Experimental :: +// * Creates an empty orc file with the schema of class `A`, which can be registered as a table. +// * This registered table can be used as the target of future `insertInto` operations. +// * +// * {{{ +// * val sqlContext = new HiveContext(...) +// * import sqlContext._ +// * +// * case class Person(name: String, age: Int) +// * createOrcFile[Person]("path/to/file.orc").registerTempTable("people") +// * sql("INSERT INTO people SELECT 'michael', 29") +// * }}} +// * +// * @tparam A A case class type that describes the desired schema of the orc file to be +// * created. +// * @param path The path where the directory containing parquet metadata should be created. +// * Data inserted into this table will also be stored at this location. +// * @param allowExisting When false, an exception will be thrown if this directory already exists. +// * @param conf A Hadoop configuration object that can be used to specify options to the parquet +// * output format. 
+// * +// * @group userf +// */ +// @Experimental +// def createOrcFile[A <: Product : TypeTag]( +// path: String, +// allowExisting: Boolean = true, +// conf: Configuration = new Configuration()): SchemaRDD = { +// new SchemaRDD( +// this, +// OrcRelation.createEmpty(path, ScalaReflection.attributesFor[A], allowExisting, conf, this)) +// } /** * Analyzes the given table in the current database to generate statistics, which will be diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala index 36657db0db63b..20e82e8746900 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala @@ -37,6 +37,7 @@ import org.apache.spark.sql.hive.HiveMetastoreTypes import scala.collection.JavaConversions._ private[sql] case class OrcRelation( + attributes: Seq[Attribute], path: String, @transient conf: Option[Configuration], @transient sqlContext: SQLContext, @@ -46,9 +47,7 @@ private[sql] case class OrcRelation( val prop: Properties = new Properties - var rowClass: Class[_] = null - - override val output = orcSchema + override lazy val output = attributes ++ orcSchema // TODO: use statistics in ORC file override lazy val statistics = Statistics(sizeInBytes = sqlContext.defaultSizeInBytes) @@ -57,6 +56,10 @@ private[sql] case class OrcRelation( // get the schema info through ORC Reader val origPath = new Path(path) val reader = OrcFileOperator.getMetaDataReader(origPath, conf) + if (reader == null) { + // return empty seq when saveAsOrcFile + return Seq.empty + } val inspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] // data types that is inspected by this inspector val schema = inspector.getTypeName @@ -72,7 +75,8 @@ private[sql] case class OrcRelation( HiveMetastoreTypes.toDataType(schema).asInstanceOf[StructType].toAttributes } - override def newInstance() = OrcRelation(path, conf, sqlContext).asInstanceOf[this.type] + override def newInstance() = + OrcRelation(attributes, path, conf, sqlContext).asInstanceOf[this.type] } private[sql] object OrcRelation { @@ -96,33 +100,8 @@ private[sql] object OrcRelation { child, "Attempt to create Orc table from unresolved child") } - createEmpty(pathString, child.output, false, conf, sqlContext) - } - - /** - * Creates an empty OrcRelation and underlying Orcfile that only - * consists of the Metadata for the given schema. - * - * @param pathString The directory the Orcfile will be stored in. - * @param attributes The schema of the relation. - * @param conf A configuration to be used. - * @return An empty OrcRelation. 
- */ - def createEmpty( - pathString: String, - attributes: Seq[Attribute], - allowExisting: Boolean, - conf: Configuration, - sqlContext: SQLContext): OrcRelation = { - val path = checkPath(pathString, allowExisting, conf) - - /** TODO: set compression kind in hive 0.13.1 - * conf.set( - * HiveConf.ConfVars.OHIVE_ORC_DEFAULT_COMPRESS.varname, - * shortOrcCompressionCodecNames.getOrElse( - * sqlContext.orcCompressionCodec.toUpperCase, CompressionKind.NONE).name) - */ - new OrcRelation(path.toString, Some(conf), sqlContext) + val path = checkPath(pathString, false, conf) + new OrcRelation(child.output, path.toString, Some(conf), sqlContext) } private def checkPath(pathStr: String, allowExisting: Boolean, conf: Configuration): Path = { @@ -147,26 +126,14 @@ private[sql] object OrcFileOperator { val conf = configuration.getOrElse(new Configuration()) val fs: FileSystem = origPath.getFileSystem(conf) val orcFiles = FileSystemHelper.listFiles(origPath, conf, ".orc") - require(orcFiles != Seq.empty, "orcFiles is empty") + if (orcFiles == Seq.empty) { + // should return null when write to orc file + return null + } if (fs.exists(origPath)) { OrcFile.createReader(fs, orcFiles(0)) } else { throw new IOException(s"File not found: $origPath") } } - - - def writeMetaData(attributes: Seq[Attribute], origPath: Path, conf: Configuration) { - require(origPath != null, "Unable to write ORC metadata: path is null") - val fs = origPath.getFileSystem(conf) - if (fs == null) { - throw new IllegalArgumentException( - s"Unable to write Orc metadata: path $origPath is incorrectly formatted") - } - - val path = origPath.makeQualified(fs) - if (fs.exists(path) && !fs.getFileStatus(path).isDir) { - throw new IllegalArgumentException(s"Expected to write to directory $path but found file") - } - } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala index 1f42a1387dd1e..a036355f65722 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala @@ -45,7 +45,6 @@ import org.apache.spark.sql.hive.{HiveMetastoreTypes, HadoopTableReader} import scala.collection.JavaConversions._ import org.apache.spark.sql.catalyst.types.StructType import org.apache.hadoop.hive.serde2.typeinfo.{TypeInfoUtils, TypeInfo} -import org.apache.spark.sql.execution.PhysicalRDD /** * logical plan of writing to ORC file @@ -115,9 +114,9 @@ case class OrcTableScan( val ids = output.map(att => { val realName = att.name.toLowerCase(Locale.ENGLISH) fieldIdMap.getOrElse(realName, -1) - }).filter(_ >= 0) + }).filter(_ >= 0).map(_.asInstanceOf[Integer]) if (ids != null && !ids.isEmpty) { - ColumnProjectionUtils.appendReadColumnIDs(conf, ids.asInstanceOf[java.util.List[Integer]]) + ColumnProjectionUtils.appendReadColumnIDs(conf, ids) } val names = output.map(_.name) @@ -188,6 +187,7 @@ private[sql] case class InsertIntoOrcTable( } } val structType = StructType.fromAttributes(relation.output) + // get Type String to build typeInfo val orcSchema = HiveMetastoreTypes.toMetastoreType(structType) val writableRdd = childRdd.mapPartitions { iter => @@ -203,7 +203,7 @@ private[sql] case class InsertIntoOrcTable( var i = 0 while (i < row.length) { outputData(i) = HadoopTableReader.unwrapData(row(i), fieldOIs(i)) - i = 1 + i += 1 } orcSerde.serialize(outputData, standardOI) } diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala index 254a1b0593904..e81db568ef43c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala @@ -23,28 +23,15 @@ import org.apache.commons.codec.binary.Base64 import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hive.ql.io.sarg.SearchArgument import org.apache.spark.sql.catalyst.ScalaReflection -import org.apache.spark.sql.SchemaRDD +import org.apache.spark.sql.{SQLContext, SchemaRDD} import scala.reflect.runtime.universe.TypeTag +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan package object orc { -// implicit class OrcContext(sqlContext: HiveContext) { -// -// def orcFile(filePath: String) = new SchemaRDD(sqlContext, -// OrcRelation(filePath, -// Some(sqlContext.sparkContext.hadoopConfiguration), sqlContext)) -// -// def createOrcFile[A <: Product : TypeTag]( -// path: String, -// allowExisting: Boolean = true, -// conf: Configuration = new Configuration()): SchemaRDD = { -// new SchemaRDD( -// sqlContext, -// OrcRelation.createEmpty(path, -// ScalaReflection.attributesFor[A], allowExisting, conf, sqlContext)) -// } -// } - - implicit class OrcSchemaRDD(rdd: SchemaRDD) { + class OrcSchemaRDD( + @transient val sqlContext1: SQLContext, + @transient val baseLogicalPlan1: LogicalPlan) + extends SchemaRDD(sqlContext1, baseLogicalPlan1) { /** * Saves the contents of this `SchemaRDD` as a ORC file, preserving the schema. Files that * are written out using this method can be read back in as a SchemaRDD using the `orcFile` @@ -54,8 +41,7 @@ package object orc { * @group schema */ def saveAsOrcFile(path: String): Unit = { - rdd.sqlContext.executePlan(WriteToOrcFile(path, - rdd.logicalPlan)).toRdd + sqlContext.executePlan(WriteToOrcFile(path, logicalPlan)).toRdd } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala index 7d8d642d22620..743225bdf6104 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala @@ -51,12 +51,11 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll { .map(x => AllDataTypes(s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0)) data.saveAsOrcFile(tempDir) - checkAnswer( TestHive.orcFile(tempDir), data.toSchemaRDD.collect().toSeq) - Utils.deleteRecursively(new File(tempDir)) +// Utils.deleteRecursively(new File(tempDir)) } test("Compression options for writing to a Orcfile") { @@ -69,6 +68,12 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll { var actualCodec = OrcFileOperator.getMetaDataReader(new Path(tempDir), Some(new Configuration())).getCompression.name assert(actualCodec == "ZLIB") - Utils.deleteRecursively(new File(tempDir)) +// Utils.deleteRecursively(new File(tempDir)) + } + + test("Orc metadata reader") { + val path = new Path("/private/var/folders/q7/whr53mh905l1xm_zqgk00j3m0000gn/T/orcTest5484580860941213910") + val reader = OrcFileOperator.getMetaDataReader(path, Some(TestHive.sparkContext.hadoopConfiguration)) + reader } } From d1bba2315d7aedf09707fe597329196ef1b7daeb Mon Sep 17 00:00:00 2001 From: scwf Date: Fri, 17 Oct 2014 23:19:00 -0700 Subject: [PATCH 12/32] added test for read orc schema --- 
.../sql/parquet/ParquetTableOperations.scala | 5 +- .../spark/sql/hive/orc/OrcRelation.scala | 58 +++++++++---------- .../apache/spark/sql/hive/orc/package.scala | 16 +---- .../spark/sql/hive/orc/ORCQuerySuite.scala | 30 ++++++---- 4 files changed, 51 insertions(+), 58 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala index fb42c34c29ce9..90b00bf9e26ef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala @@ -497,7 +497,7 @@ private[parquet] object FilteringParquetRowInputFormat { .build[FileStatus, Array[BlockLocation]]() } -private[sql] object FileSystemHelper { +private[sql] object FileSystemHelper extends Logging{ def listFiles(pathStr: String, conf: Configuration): Seq[Path] = { val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) @@ -523,9 +523,10 @@ private[sql] object FileSystemHelper { s"Path $origPath is incorrectly formatted") } val path = origPath.makeQualified(fs) - if (fs.exists(path) && fs.getFileStatus(path).isDir) { + if (fs.exists(path)) { fs.listStatus(path).map(_.getPath).filter(p => p.getName.endsWith(extension)) } else { + println(s"${path.toString} not exists") Seq.empty } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala index 20e82e8746900..79d267adb14e9 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala @@ -35,6 +35,7 @@ import org.apache.spark.sql.catalyst.types._ import org.apache.spark.sql.hive.HiveMetastoreTypes import scala.collection.JavaConversions._ +import org.apache.spark.Logging private[sql] case class OrcRelation( attributes: Seq[Attribute], @@ -47,34 +48,11 @@ private[sql] case class OrcRelation( val prop: Properties = new Properties - override lazy val output = attributes ++ orcSchema + override lazy val output = attributes ++ OrcFileOperator.orcSchema(path, conf, prop) // TODO: use statistics in ORC file override lazy val statistics = Statistics(sizeInBytes = sqlContext.defaultSizeInBytes) - private def orcSchema: Seq[Attribute] = { - // get the schema info through ORC Reader - val origPath = new Path(path) - val reader = OrcFileOperator.getMetaDataReader(origPath, conf) - if (reader == null) { - // return empty seq when saveAsOrcFile - return Seq.empty - } - val inspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] - // data types that is inspected by this inspector - val schema = inspector.getTypeName - - // set prop here, initial OrcSerde need it - val fields = inspector.getAllStructFieldRefs - val (columns, columnTypes) = fields.map { f => - f.getFieldName -> f.getFieldObjectInspector.getTypeName - }.unzip - prop.setProperty("columns", columns.mkString(",")) - prop.setProperty("columns.types", columnTypes.mkString(":")) - - HiveMetastoreTypes.toDataType(schema).asInstanceOf[StructType].toAttributes - } - override def newInstance() = OrcRelation(attributes, path, conf, sqlContext).asInstanceOf[this.type] } @@ -121,7 +99,7 @@ private[sql] object OrcRelation { } } -private[sql] object OrcFileOperator { +private[sql] object OrcFileOperator{ def getMetaDataReader(origPath: Path, configuration: Option[Configuration]): Reader = { 
val conf = configuration.getOrElse(new Configuration()) val fs: FileSystem = origPath.getFileSystem(conf) @@ -130,10 +108,32 @@ private[sql] object OrcFileOperator { // should return null when write to orc file return null } - if (fs.exists(origPath)) { - OrcFile.createReader(fs, orcFiles(0)) - } else { - throw new IOException(s"File not found: $origPath") + OrcFile.createReader(fs, orcFiles(0)) + } + + def orcSchema( + path: String, + conf: Option[Configuration], + prop: Properties): Seq[Attribute] = { + // get the schema info through ORC Reader + val origPath = new Path(path) + val reader = getMetaDataReader(origPath, conf) + if (reader == null) { + // return empty seq when saveAsOrcFile + return Seq.empty } + val inspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] + // data types that is inspected by this inspector + val schema = inspector.getTypeName + + // set prop here, initial OrcSerde need it + val fields = inspector.getAllStructFieldRefs + val (columns, columnTypes) = fields.map { f => + f.getFieldName -> f.getFieldObjectInspector.getTypeName + }.unzip + prop.setProperty("columns", columns.mkString(",")) + prop.setProperty("columns.types", columnTypes.mkString(":")) + + HiveMetastoreTypes.toDataType(schema).asInstanceOf[StructType].toAttributes } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala index e81db568ef43c..06b2d0e3aee3a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala @@ -17,14 +17,7 @@ package org.apache.spark.sql.hive -import com.esotericsoftware.kryo.io.Output -import com.esotericsoftware.kryo.Kryo -import org.apache.commons.codec.binary.Base64 -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.hive.ql.io.sarg.SearchArgument -import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.{SQLContext, SchemaRDD} -import scala.reflect.runtime.universe.TypeTag import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan package object orc { @@ -49,12 +42,5 @@ package object orc { val orcDefaultCompressVar = "hive.exec.orc.default.compress" // for prediction push down in hive-0.13.1, don't enable it val ORC_FILTER_PUSHDOWN_ENABLED = false - val SARG_PUSHDOWN = "sarg.pushdown"; - - def toKryo(input: SearchArgument) = { - val out = new Output(4 * 1024, 10 * 1024 * 1024); - new Kryo().writeObject(out, input); - out.close(); - Base64.encodeBase64String(out.toBytes()); - } + val SARG_PUSHDOWN = "sarg.pushdown" } \ No newline at end of file diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala index 743225bdf6104..0d883bb9b4000 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala @@ -17,17 +17,16 @@ package org.apache.spark.sql.hive.orc -import org.apache.spark.{SparkConf, SparkContext} -import org.scalatest.{BeforeAndAfterAll, FunSuiteLike} +import java.util.Properties +import org.scalatest.BeforeAndAfterAll import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path -import org.apache.spark.util.Utils import org.apache.spark.sql.QueryTest import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.catalyst.util.getTempFilePath import 
org.apache.spark.sql.hive.test.TestHive._ - - +import org.apache.spark.sql.catalyst.types._ +import org.apache.spark.util.Utils import java.io.File case class TestRDDEntry(key: Int, value: String) @@ -55,7 +54,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll { TestHive.orcFile(tempDir), data.toSchemaRDD.collect().toSeq) -// Utils.deleteRecursively(new File(tempDir)) + Utils.deleteRecursively(new File(tempDir)) } test("Compression options for writing to a Orcfile") { @@ -65,15 +64,22 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll { // test default compression codec, now only support zlib rdd.saveAsOrcFile(tempDir) - var actualCodec = OrcFileOperator.getMetaDataReader(new Path(tempDir), Some(new Configuration())).getCompression.name + val actualCodec = OrcFileOperator.getMetaDataReader(new Path(tempDir), Some(new Configuration())).getCompression.name assert(actualCodec == "ZLIB") -// Utils.deleteRecursively(new File(tempDir)) + Utils.deleteRecursively(new File(tempDir)) } - test("Orc metadata reader") { - val path = new Path("/private/var/folders/q7/whr53mh905l1xm_zqgk00j3m0000gn/T/orcTest5484580860941213910") - val reader = OrcFileOperator.getMetaDataReader(path, Some(TestHive.sparkContext.hadoopConfiguration)) - reader + test("Get ORC Schema with ORC Reader") { + val path = "sql/hive/src/test/resources/data/files/orcfiles" + val attributes = OrcFileOperator.orcSchema(path, Some(TestHive.sparkContext.hadoopConfiguration), new Properties()) + assert(attributes(0).dataType == StringType) + assert(attributes(1).dataType == IntegerType) + assert(attributes(2).dataType == LongType) + assert(attributes(3).dataType == FloatType) + assert(attributes(4).dataType == DoubleType) + assert(attributes(5).dataType == ShortType) + assert(attributes(6).dataType == ByteType) + assert(attributes(7).dataType == BooleanType) } } From b46ecb1cf6336cd2b20b24d893bd660f43d51814 Mon Sep 17 00:00:00 2001 From: scwf Date: Fri, 17 Oct 2014 23:27:52 -0700 Subject: [PATCH 13/32] add test files --- .../data/files/orcfiles/.part-r-1.orc.crc | Bin 0 -> 20 bytes .../data/files/orcfiles/.part-r-2.orc.crc | Bin 0 -> 20 bytes .../test/resources/data/files/orcfiles/_SUCCESS | 0 .../resources/data/files/orcfiles/part-r-1.orc | Bin 0 -> 1296 bytes .../resources/data/files/orcfiles/part-r-2.orc | Bin 0 -> 1286 bytes 5 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 sql/hive/src/test/resources/data/files/orcfiles/.part-r-1.orc.crc create mode 100644 sql/hive/src/test/resources/data/files/orcfiles/.part-r-2.orc.crc create mode 100755 sql/hive/src/test/resources/data/files/orcfiles/_SUCCESS create mode 100755 sql/hive/src/test/resources/data/files/orcfiles/part-r-1.orc create mode 100755 sql/hive/src/test/resources/data/files/orcfiles/part-r-2.orc diff --git a/sql/hive/src/test/resources/data/files/orcfiles/.part-r-1.orc.crc b/sql/hive/src/test/resources/data/files/orcfiles/.part-r-1.orc.crc new file mode 100644 index 0000000000000000000000000000000000000000..048f7346c2c05e08ebe5a11aed78665f40e3e378 GIT binary patch literal 20 bcmYc;N@ieSU}BgvvBaS1*w4n6U^Z_6Ku`wu literal 0 HcmV?d00001 diff --git a/sql/hive/src/test/resources/data/files/orcfiles/.part-r-2.orc.crc b/sql/hive/src/test/resources/data/files/orcfiles/.part-r-2.orc.crc new file mode 100644 index 0000000000000000000000000000000000000000..52810863f2ae9a45c8f00f8d022cbe68369ea23b GIT binary patch literal 20 bcmYc;N@ieSU}BK>5DB}x^|_y9#fBvSHcbY& literal 0 HcmV?d00001 diff --git 
a/sql/hive/src/test/resources/data/files/orcfiles/_SUCCESS b/sql/hive/src/test/resources/data/files/orcfiles/_SUCCESS new file mode 100755 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/data/files/orcfiles/part-r-1.orc b/sql/hive/src/test/resources/data/files/orcfiles/part-r-1.orc new file mode 100755 index 0000000000000000000000000000000000000000..3188e14c1fa482780268cdff5acb8ce8c2e5609e GIT binary patch literal 1296 zcmeYda^`1X;9?bG=4fElVPJSH^fW1zfl*PekIhZBOwWy3QO(<%K^Le*go~MhfkB8D zs7{E3gF)aQqeMeJP6;cZQ9+NBk`e@h9%Lj;ne;5BfyvRJQsJJX(4iae85ju3<1`qj zgeuUD0$eOWH!=X-*TAU7%*E)Sz{nume~^*ck%K8ZD!}1)zE0ddr871gqR#2aF)X?u z(37yChUH{Zu*u;c0<&KjT#8`B< zTypCuvwET8BYyXbz{2TvzdDVM7$_f+OzkNY@-a9m;<;>Q;xv)sGn*Y|xlW5vJIu9u z&BWBXyk;ji9yPOmax=BRmP@AKgrIAm<-;jL>2r!5XL_&OvFs4<_B(-3=dsJJ{CLDz z|4QZ4-g2p$4=2Ta_x*fm{*8SGBZKcrb3a)I0oRM+yOt~{aBS3R5pd%0@%uM_&c>6H z+l;(tt5$M~{1@>le)dxB{`>z2jAcGHC(Atd_bZsR-{NTIxq@lMbDlGa|D5d|^NcHe z=dsefg2~Z$p7H%JIPma9Qdw@=`Lu7j&km&VE9&`4xvWN2VuWMXDvWn<^yMC+6cQE@6%&_`l#-T_m6KOc zR8m$^Ra4i{)Y8_`)zddHG%_|ZH8Z!cw6eCbwX=6{baHlab#wRd^z!!c_45x13;PSFZ;q=bd{Sgw(_n6Y?AlZfRPhkNIjyLx7b1kY`fRa|D8Th?Z~^o^zH z?WwGP{`1ByXn)uw|7gRm&yJhLl}f4=%%(_hd#!)uS6fg`!OTr16YVP)86GEy-8igx z^z=cUg7&~i_N$%dxG^sd;$!a@U{@A4=GK%<122qB^44iTb3y+*TnxSwcB{TCV_Y=;-0t1de$x|{`xIfoo*co(b z4o`prhZ_G8YY!)h-?AM6jrZC36vDe_c3(W0*d))uv7b=@ScWwSC^0ecG<{)a_78Fv F0RZVK>0kf= literal 0 HcmV?d00001 diff --git a/sql/hive/src/test/resources/data/files/orcfiles/part-r-2.orc b/sql/hive/src/test/resources/data/files/orcfiles/part-r-2.orc new file mode 100755 index 0000000000000000000000000000000000000000..525488226e2a975260e903d3b70e7b39c182cc0c GIT binary patch literal 1286 zcma)+doP*9Az3SGt3A%l#59vj zk!hJBx5%}Kq}j+~jKL7yT(UL{<4$Zba+w_`yMK1}ug`hU^PKm2&ilvfO?A-*0Ih*X zAX(5?0LW5@f`S2L%DA!3ow#ow-GzWLr4xL80a@kJMIb~HF@HThRHgeb2E%d7~b?mMC@jZH1_7~5TS zidGX-w~WJ8t!CVS#p2vtSAl0~aUsWlY9cq!E%V5ei9&ARM?MTU^A8pB>6{btKKMs* zhF47=oqK9YI5gBOpp*;coRf39d5)%6^-A`-S-4jwNZ5`Cm2%seeh2aRLUtBoCU~B7 z_ZXEzxPLyXbuHeqC?HBosEBDly6AEy*qqYe?o+;aEN^7qqKxXd8wEfE69F)rV5zD7 z%WDwWE{Z#z4FS+?HSqoZ^yAWUhuWH&;M4srWf$k|Y>}}?acrXq_Dy%ApOPwDjDC)d zHxq;>loSdgI0Cw0);7B1o30oPsk$|YLVDB@WpS^%J?!kg549J12HBeDP&b*0~u~jHeI5Sg;ez!?@L}7+2Yg|A?OSTM z$lW4yE9lc)=fU{z?g$95SXW}>*jG8+Yw-z*N!M>Ar`$~C-TEypoiE79yq%Sulbd%Z zzu<0RQE^FW*}ePa6%Q(_9#%i9c`W?>Np0QJ`ac>Pn?%jx7D;PcyR@V8Sy%V-7cXDE z?&3ps;9qZthDT(hWAgEd$*Jj?*}3_53yXih|FE>ISoyfRw!Wd%0APD*Jpf)E z6fDacsowoUJEc4+zRh*;vY(X!&UDQ)v+ zqoa#P)6K>ToQVlGi;3_Q&zzFmWPfO7#jYg&GMtUq(L8{&L69-Hhic@VI2(vez}Xmn zZS7Bv1}KI}Ub9#n9!fK94yT2o^>HI3p)g({Aw7SWgYtER_L>ko+rTyyHWvQqR3G;R ze_{9uVu!i4)))d68MPLo3*S;`pEuA Date: Fri, 17 Oct 2014 23:44:57 -0700 Subject: [PATCH 14/32] scala style fixed --- .../org/apache/spark/sql/SQLContext.scala | 6 +- .../apache/spark/sql/hive/HiveContext.scala | 60 +++---------------- .../spark/sql/hive/orc/OrcRelation.scala | 8 +-- .../sql/hive/orc/OrcTableOperations.scala | 6 +- .../apache/spark/sql/hive/orc/package.scala | 2 +- 5 files changed, 15 insertions(+), 67 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 0a0e351c8ac8d..23e7b2d270777 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -149,10 +149,8 @@ class SQLContext(@transient val sparkContext: SparkContext) * * @group userf */ - def parquetFile(path: String): SchemaRDD = { - val a = new SchemaRDD(this, parquet.ParquetRelation(path, 
Some(sparkContext.hadoopConfiguration), this)) - a - } + def parquetFile(path: String): SchemaRDD = + new SchemaRDD(this, parquet.ParquetRelation(path, Some(sparkContext.hadoopConfiguration), this)) /** * Loads a JSON file (one object per line), returning the result as a [[SchemaRDD]]. diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index d1cbe60853766..0f7d5a7fd92bc 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -23,9 +23,9 @@ import java.util.{ArrayList => JArrayList} import scala.collection.JavaConversions._ import scala.language.implicitConversions -import scala.reflect.runtime.universe.{TypeTag, typeTag} +import scala.reflect.runtime.universe.TypeTag +import scala.Some -import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.conf.HiveConf @@ -39,21 +39,17 @@ import org.apache.hadoop.hive.serde2.io.DateWritable import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD -import org.apache.spark.annotation.Experimental -import org.apache.spark.sql.hive.orc.{OrcSchemaRDD, OrcRelation} +import org.apache.spark.sql.hive.orc.OrcSchemaRDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.analysis.{Analyzer, EliminateAnalysisOperators} import org.apache.spark.sql.catalyst.analysis.{OverrideCatalog, OverrideFunctionRegistry} import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.execution.{Command => PhysicalCommand, _} -import org.apache.spark.sql.hive.execution.DescribeHiveTableCommand +import org.apache.spark.sql.execution._ +import org.apache.spark.sql.execution.{Command => PhysicalCommand} import org.apache.spark.sql.catalyst.plans.logical.SetCommand -import scala.Some import org.apache.spark.sql.catalyst.plans.logical.NativeCommand -import org.apache.spark.sql.hive.MetastoreRelation import org.apache.spark.sql.hive.execution.DescribeHiveTableCommand -import org.apache.spark.sql.execution.LogicalRDD /** * DEPRECATED: Use HiveContext instead. @@ -116,14 +112,6 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { @deprecated("hql() is deprecated as the sql function now parses using HiveQL by default. " + s"The SQL dialect for parsing can be set using ${SQLConf.DIALECT}", "1.1") def hql(hqlQuery: String): SchemaRDD = hiveql(hqlQuery) - /** - * Creates a SchemaRDD from an RDD of case classes. - * - * @group userf - */ -// implicit def createOrcSchemaRDD(rdd: SchemaRDD) = { -// new OrcSchemaRDD(rdd) -// } /** * Creates a SchemaRDD from an RDD of case classes. @@ -152,42 +140,8 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { * * @group userf */ - def orcFile(path: String): SchemaRDD = - new SchemaRDD(this, orc.OrcRelation(Seq.empty, path, Some(sparkContext.hadoopConfiguration), this)) - -// /** -// * :: Experimental :: -// * Creates an empty orc file with the schema of class `A`, which can be registered as a table. -// * This registered table can be used as the target of future `insertInto` operations. -// * -// * {{{ -// * val sqlContext = new HiveContext(...) 
-// * import sqlContext._ -// * -// * case class Person(name: String, age: Int) -// * createOrcFile[Person]("path/to/file.orc").registerTempTable("people") -// * sql("INSERT INTO people SELECT 'michael', 29") -// * }}} -// * -// * @tparam A A case class type that describes the desired schema of the orc file to be -// * created. -// * @param path The path where the directory containing parquet metadata should be created. -// * Data inserted into this table will also be stored at this location. -// * @param allowExisting When false, an exception will be thrown if this directory already exists. -// * @param conf A Hadoop configuration object that can be used to specify options to the parquet -// * output format. -// * -// * @group userf -// */ -// @Experimental -// def createOrcFile[A <: Product : TypeTag]( -// path: String, -// allowExisting: Boolean = true, -// conf: Configuration = new Configuration()): SchemaRDD = { -// new SchemaRDD( -// this, -// OrcRelation.createEmpty(path, ScalaReflection.attributesFor[A], allowExisting, conf, this)) -// } + def orcFile(path: String): SchemaRDD = new SchemaRDD( + this, orc.OrcRelation(Seq.empty, path, Some(sparkContext.hadoopConfiguration), this)) /** * Analyzes the given table in the current database to generate statistics, which will be diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala index 79d267adb14e9..1c453f9ecd2bd 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala @@ -18,24 +18,21 @@ package org.apache.spark.sql.hive.orc import java.util.Properties -import java.io.IOException - import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsAction import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.hadoop.hive.ql.io.orc._ -import org.apache.spark.sql.parquet.FileSystemHelper -import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, LeafNode} import org.apache.spark.sql.catalyst.analysis.{UnresolvedException, MultiInstanceRelation} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.types._ import org.apache.spark.sql.hive.HiveMetastoreTypes +import org.apache.spark.sql.parquet.FileSystemHelper +import org.apache.spark.sql.SQLContext import scala.collection.JavaConversions._ -import org.apache.spark.Logging private[sql] case class OrcRelation( attributes: Seq[Attribute], @@ -125,7 +122,6 @@ private[sql] object OrcFileOperator{ val inspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] // data types that is inspected by this inspector val schema = inspector.getTypeName - // set prop here, initial OrcSerde need it val fields = inspector.getAllStructFieldRefs val (columns, columnTypes) = fields.map { f => diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala index a036355f65722..0d6655f257602 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala @@ -28,10 +28,11 @@ import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, FileOutputCommi import 
org.apache.hadoop.mapreduce.lib.input.FileInputFormat import org.apache.hadoop.io.{Writable, NullWritable} import org.apache.hadoop.mapreduce.{TaskID, TaskAttemptContext, Job} +import org.apache.hadoop.mapred.{SparkHadoopMapRedUtil, Reporter, JobConf} import org.apache.hadoop.hive.ql.io.orc.{OrcSerde, OrcInputFormat, OrcOutputFormat} import org.apache.hadoop.hive.serde2.objectinspector._ import org.apache.hadoop.hive.serde2.ColumnProjectionUtils -import org.apache.hadoop.mapred.{SparkHadoopMapRedUtil, Reporter, JobConf} +import org.apache.hadoop.hive.serde2.typeinfo.{TypeInfoUtils, TypeInfo} import org.apache.spark.sql.execution._ import org.apache.spark.sql.catalyst.expressions._ @@ -39,12 +40,11 @@ import org.apache.spark.sql.parquet.FileSystemHelper import org.apache.spark.{TaskContext, SerializableWritable} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode => LogicalUnaryNode} +import org.apache.spark.sql.catalyst.types.StructType import org.apache.spark.sql.execution.UnaryNode import org.apache.spark.sql.hive.{HiveMetastoreTypes, HadoopTableReader} import scala.collection.JavaConversions._ -import org.apache.spark.sql.catalyst.types.StructType -import org.apache.hadoop.hive.serde2.typeinfo.{TypeInfoUtils, TypeInfo} /** * logical plan of writing to ORC file diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala index 06b2d0e3aee3a..cf2a3b87a3dfb 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala @@ -43,4 +43,4 @@ package object orc { // for prediction push down in hive-0.13.1, don't enable it val ORC_FILTER_PUSHDOWN_ENABLED = false val SARG_PUSHDOWN = "sarg.pushdown" -} \ No newline at end of file +} From b06e33504c30eea973832459a6cc68c85c14b888 Mon Sep 17 00:00:00 2001 From: scwf Date: Fri, 17 Oct 2014 23:49:47 -0700 Subject: [PATCH 15/32] revert no related changes --- .../scala/org/apache/spark/util/Utils.scala | 12 +++---- docs/sql-programming-guide.md | 33 ------------------- 2 files changed, 4 insertions(+), 41 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 03a484a6cb9d5..53a7512edd852 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -659,15 +659,11 @@ private[spark] object Utils extends Logging { } private def listFilesSafely(file: File): Seq[File] = { - if (file.exists()) { - val files = file.listFiles() - if (files == null) { - throw new IOException("Failed to list files for dir: " + file) - } - files - } else { - List() + val files = file.listFiles() + if (files == null) { + throw new IOException("Failed to list files for dir: " + file) } + files } /** diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 368acf68fa6a4..233fe1225a43e 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -687,39 +687,6 @@ for teenName in teenNames.collect():
-### Configuration
-
-Configuration of ORC can be done using the `setConf` method on HiveContext or by running
-`SET key=value` commands using SQL.
-
-<table class="table">
-<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
-<tr>
-  <td><code>spark.sql.parquet.binaryAsString</code></td>
-  <td>false</td>
-  <td>
-    Some other Parquet-producing systems, in particular Impala and older versions of Spark SQL, do
-    not differentiate between binary data and strings when writing out the Parquet schema. This
-    flag tells Spark SQL to interpret binary data as a string to provide compatibility with these systems.
-  </td>
-</tr>
-<tr>
-  <td><code>spark.sql.parquet.cacheMetadata</code></td>
-  <td>false</td>
-  <td>
-    Turns on caching of Parquet schema metadata. Can speed up querying of static data.
-  </td>
-</tr>
-<tr>
-  <td><code>spark.sql.parquet.compression.codec</code></td>
-  <td>snappy</td>
-  <td>
-    Sets the compression codec use when writing Parquet files. Acceptable values include:
-    uncompressed, snappy, gzip, lzo.
-  </td>
-</tr>
-</table>
-
 ## JSON Datasets
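The configuration section deleted above describes setting ORC options either through setConf on a HiveContext or with SET key=value statements in SQL, and the test suites added in the patches that follow exercise the same write/read path. The sketch below ties those pieces together; it is illustrative only and is not part of the patch series. The Person case class and the /tmp/people_orc path are made up for the example, and it assumes the implicit conversions introduced by these patches (case-class RDD to SchemaRDD, plus the saveAsOrcFile enrichment) are in scope via import hiveContext._; orcFile and the hive.exec.orc.default.compress key come from the patches themselves, and per the package object that compression setting only takes effect against Hive 0.13.1.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

// Hypothetical record type describing the rows written out as ORC.
case class Person(name: String, age: Int)

object OrcUsageSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("orc-usage-sketch"))
    val hiveContext = new HiveContext(sc)
    // Assumed to bring the patch's implicit conversions into scope
    // (case-class RDD -> SchemaRDD, saveAsOrcFile), mirroring ORCQuerySuite.
    import hiveContext._

    // ORC compression is driven by the Hive configuration key used in the patches;
    // it only has an effect when running against hive 0.13.1.
    sc.hadoopConfiguration.set("hive.exec.orc.default.compress", "ZLIB")

    // Write an RDD of case classes as an ORC file, then load it back and query it.
    val people = sc.parallelize(1 to 100).map(i => Person(s"name_$i", i % 80))
    people.saveAsOrcFile("/tmp/people_orc")            // assumed output path

    val loaded = hiveContext.orcFile("/tmp/people_orc")
    loaded.registerTempTable("people")
    sql("SELECT name FROM people WHERE age > 21").collect().foreach(println)

    sc.stop()
  }
}

Going through the Hadoop configuration rather than a dedicated SQLConf key mirrors what the compression tests in the following patches do.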
From 4d0950b198027850a9caf0bb292fad570d87cf5c Mon Sep 17 00:00:00 2001 From: scwf Date: Sat, 18 Oct 2014 01:14:37 -0700 Subject: [PATCH 16/32] more tests --- .../sql/parquet/ParquetTableOperations.scala | 3 +- .../spark/sql/hive/orc/ORCQuerySuite.scala | 146 +++++++++++++++++- 2 files changed, 145 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala index e3069e4772953..e52a5eea7a44b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala @@ -497,7 +497,7 @@ private[parquet] object FilteringParquetRowInputFormat { .build[FileStatus, Array[BlockLocation]]() } -private[sql] object FileSystemHelper extends Logging{ +private[sql] object FileSystemHelper { def listFiles(pathStr: String, conf: Configuration): Seq[Path] = { val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) @@ -526,7 +526,6 @@ private[sql] object FileSystemHelper extends Logging{ if (fs.exists(path)) { fs.listStatus(path).map(_.getPath).filter(p => p.getName.endsWith(extension)) } else { - println(s"${path.toString} not exists") Seq.empty } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala index 0d883bb9b4000..c439ef532d0f5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala @@ -28,9 +28,32 @@ import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.catalyst.types._ import org.apache.spark.util.Utils import java.io.File +import org.apache.hadoop.hive.ql.io.orc.CompressionKind case class TestRDDEntry(key: Int, value: String) +case class NullReflectData( + intField: java.lang.Integer, + longField: java.lang.Long, + floatField: java.lang.Float, + doubleField: java.lang.Double, + booleanField: java.lang.Boolean) + +case class OptionalReflectData( + intField: Option[Int], + longField: Option[Long], + floatField: Option[Float], + doubleField: Option[Double], + booleanField: Option[Boolean]) + +case class Nested(i: Int, s: String) + +case class Data(array: Seq[Int], nested: Nested) + +case class Contact(name: String, phone: String) + +case class Person(name: String, age: Int, contacts: Seq[Contact]) + case class AllDataTypes( stringField: String, intField: Int, @@ -41,13 +64,32 @@ case class AllDataTypes( byteField: Byte, booleanField: Boolean) +case class AllDataTypesWithNonPrimitiveType( + stringField: String, + intField: Int, + longField: Long, + floatField: Float, + doubleField: Double, + shortField: Short, + byteField: Byte, + booleanField: Boolean, + array: Seq[Int], + arrayContainsNull: Seq[Option[Int]], + map: Map[Int, Long], + mapValueContainsNull: Map[Int, Option[Long]], + data: Data) + +case class BinaryData(binaryData: Array[Byte]) + class OrcQuerySuite extends QueryTest with BeforeAndAfterAll { test("Read/Write All Types") { val tempDir = getTempFilePath("orcTest").getCanonicalPath val range = (0 to 255) - val data = sparkContext.parallelize(range) - .map(x => AllDataTypes(s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0)) + val data = sparkContext + .parallelize(range) + .map(x => AllDataTypes( + s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, 
x.toByte, x % 2 == 0)) data.saveAsOrcFile(tempDir) checkAnswer( @@ -57,6 +99,79 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll { Utils.deleteRecursively(new File(tempDir)) } + test("read/write binary data") { + val tempDir = getTempFilePath("orcTest").getCanonicalPath + sparkContext.parallelize(BinaryData("test".getBytes("utf8")) :: Nil).saveAsOrcFile(tempDir) + TestHive.orcFile(tempDir) + .map(r => new String(r(0).asInstanceOf[Array[Byte]], "utf8")) + .collect().toSeq == Seq("test") + Utils.deleteRecursively(new File(tempDir)) + } + + test("Read/Write All Types with non-primitive type") { + val tempDir = getTempFilePath("orcTest").getCanonicalPath + val range = (0 to 255) + val data = sparkContext.parallelize(range) + .map(x => AllDataTypesWithNonPrimitiveType( + s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0, + (0 until x), + (0 until x).map(Option(_).filter(_ % 3 == 0)), + (0 until x).map(i => i -> i.toLong).toMap, + (0 until x).map(i => i -> Option(i.toLong)).toMap + (x -> None), + Data((0 until x), Nested(x, s"$x")))) + data.saveAsOrcFile(tempDir) + + checkAnswer( + TestHive.orcFile(tempDir), + data.toSchemaRDD.collect().toSeq) + Utils.deleteRecursively(new File(tempDir)) + } + + test("Creating case class RDD table") { + sparkContext.parallelize((1 to 100)) + .map(i => TestRDDEntry(i, s"val_$i")) + .registerTempTable("tmp") + val rdd = sql("SELECT * FROM tmp").collect().sortBy(_.getInt(0)) + var counter = 1 + rdd.foreach { + // '===' does not like string comparison? + row => { + assert(row.getString(1).equals(s"val_$counter"), s"row $counter value ${row.getString(1)} does not match val_$counter") + counter = counter + 1 + } + } + } + + test("Simple selection form orc table") { + val tempDir = getTempFilePath("orcTest").getCanonicalPath + val data = sparkContext.parallelize((1 to 10)) + .map(i => Person(s"name_$i", i, (0 until 2).map{ m=> + Contact(s"contact_$m", s"phone_$m") })) + data.saveAsOrcFile(tempDir) + val f = TestHive.orcFile(tempDir) + f.registerTempTable("tmp") + var rdd = sql("SELECT name FROM tmp where age <= 5") + assert(rdd.count() == 5) + + rdd = sql("SELECT name, contacts FROM tmp where age > 5") + assert(rdd.count() == 5) + + val contacts = rdd.flatMap(t=>t(1).asInstanceOf[Seq[_]]) + assert(contacts.count() == 10) + Utils.deleteRecursively(new File(tempDir)) + } + + test("save and load case class RDD with Nones as orc") { + val data = OptionalReflectData(None, None, None, None, None) + val rdd = sparkContext.parallelize(data :: Nil) + val tempDir = getTempFilePath("orcTest").getCanonicalPath + rdd.saveAsOrcFile(tempDir) + val readFile = TestHive.orcFile(tempDir) + val rdd_saved = readFile.collect() + assert(rdd_saved(0) === Seq.fill(5)(null)) + Utils.deleteRecursively(new File(tempDir)) + } + test("Compression options for writing to a Orcfile") { val tempDir = getTempFilePath("orcTest").getCanonicalPath val rdd = TestHive.sparkContext.parallelize((1 to 100)) @@ -82,4 +197,31 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll { assert(attributes(6).dataType == ByteType) assert(attributes(7).dataType == BooleanType) } + + ignore("Other Compression options for writing to an Orcfile only supported in hive 0.13.1 and above") { + TestHive.sparkContext.hadoopConfiguration.set(orcDefaultCompressVar, "SNAPPY") + var tempDir = getTempFilePath("orcTest").getCanonicalPath + val rdd = sparkContext.parallelize((1 to 100)) + .map(i => TestRDDEntry(i, s"val_$i")) + rdd.saveAsOrcFile(tempDir) + var actualCodec = 
OrcFileOperator.getMetaDataReader(new Path(tempDir), Some(new Configuration())).getCompression + assert(actualCodec == CompressionKind.SNAPPY) + Utils.deleteRecursively(new File(tempDir)) + + TestHive.sparkContext.hadoopConfiguration.set(orcDefaultCompressVar, "NONE") + tempDir = getTempFilePath("orcTest").getCanonicalPath + rdd.saveAsOrcFile(tempDir) + actualCodec = OrcFileOperator.getMetaDataReader(new Path(tempDir), Some(new Configuration())).getCompression + assert(actualCodec == CompressionKind.NONE) + Utils.deleteRecursively(new File(tempDir)) + + TestHive.sparkContext.hadoopConfiguration.set(orcDefaultCompressVar, "LZO") + tempDir = getTempFilePath("orcTest").getCanonicalPath + rdd.saveAsOrcFile(tempDir) + actualCodec = OrcFileOperator.getMetaDataReader(new Path(tempDir), Some(new Configuration())).getCompression + assert(actualCodec == CompressionKind.LZO) + Utils.deleteRecursively(new File(tempDir)) + } + + } From 37a21927b2c5a8e633fc4c727542f639e7a52569 Mon Sep 17 00:00:00 2001 From: scwf Date: Sat, 18 Oct 2014 03:37:33 -0700 Subject: [PATCH 17/32] tests suite fix --- .../sql/hive/orc/OrcTableOperations.scala | 2 +- .../apache/spark/sql/hive/orc/package.scala | 37 ++++++++++++++++++- .../spark/sql/hive/orc/ORCQuerySuite.scala | 15 ++++---- 3 files changed, 45 insertions(+), 9 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala index 0d6655f257602..f15a7c5a4a814 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala @@ -202,7 +202,7 @@ private[sql] case class InsertIntoOrcTable( iter.map { row => var i = 0 while (i < row.length) { - outputData(i) = HadoopTableReader.unwrapData(row(i), fieldOIs(i)) + outputData(i) = HadoopTypeConverter.wrap((row(i), fieldOIs(i))) i += 1 } orcSerde.serialize(outputData, standardOI) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala index cf2a3b87a3dfb..7998e6a60f079 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala @@ -19,7 +19,12 @@ package org.apache.spark.sql.hive import org.apache.spark.sql.{SQLContext, SchemaRDD} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan - +import org.apache.hadoop.hive.serde2.objectinspector._ +import org.apache.hadoop.hive.serde2.objectinspector.primitive.{JavaHiveDecimalObjectInspector, JavaHiveVarcharObjectInspector} +import org.apache.hadoop.hive.common.`type`.{HiveDecimal, HiveVarchar} +import org.apache.spark.sql.catalyst.expressions.Row +/* Implicit conversions */ +import scala.collection.JavaConversions._ package object orc { class OrcSchemaRDD( @transient val sqlContext1: SQLContext, @@ -38,6 +43,36 @@ package object orc { } } + object HadoopTypeConverter extends HiveInspectors { + def wrap(a: (Any, ObjectInspector)): Any = a match { + case (s: String, oi: JavaHiveVarcharObjectInspector) => + new HiveVarchar(s, s.size) + + case (bd: BigDecimal, oi: JavaHiveDecimalObjectInspector) => + new HiveDecimal(bd.underlying()) + + case (row: Row, oi: StandardStructObjectInspector) => + val struct = oi.create() + row.zip(oi.getAllStructFieldRefs: Seq[StructField]).foreach { + case (data, field) => + oi.setStructFieldData(struct, field, wrap(data, 
field.getFieldObjectInspector)) + } + struct + case (s: Seq[_], oi: ListObjectInspector) => + val wrappedSeq = s.map(wrap(_, oi.getListElementObjectInspector)) + seqAsJavaList(wrappedSeq) + + case (m: Map[_, _], oi: MapObjectInspector) => + val keyOi = oi.getMapKeyObjectInspector + val valueOi = oi.getMapValueObjectInspector + val wrappedMap = m.map { case (key, value) => wrap(key, keyOi) -> wrap(value, valueOi) } + mapAsJavaMap(wrappedMap) + + case (obj, _) => + obj + } + } + // for orc compression type, only take effect in hive 0.13.1 val orcDefaultCompressVar = "hive.exec.orc.default.compress" // for prediction push down in hive-0.13.1, don't enable it diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala index c439ef532d0f5..791d7c01066de 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.hive.orc import java.util.Properties +import java.io.File import org.scalatest.BeforeAndAfterAll import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path @@ -27,7 +28,6 @@ import org.apache.spark.sql.catalyst.util.getTempFilePath import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.catalyst.types._ import org.apache.spark.util.Utils -import java.io.File import org.apache.hadoop.hive.ql.io.orc.CompressionKind case class TestRDDEntry(key: Int, value: String) @@ -101,10 +101,13 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll { test("read/write binary data") { val tempDir = getTempFilePath("orcTest").getCanonicalPath - sparkContext.parallelize(BinaryData("test".getBytes("utf8")) :: Nil).saveAsOrcFile(tempDir) + val range = (0 to 3) + sparkContext.parallelize(range) + .map(x => BinaryData(s"test$x".getBytes("utf8"))).saveAsOrcFile(tempDir) + TestHive.orcFile(tempDir) .map(r => new String(r(0).asInstanceOf[Array[Byte]], "utf8")) - .collect().toSeq == Seq("test") + .collect().toSet == Set("test0", "test1", "test2", "test3") Utils.deleteRecursively(new File(tempDir)) } @@ -163,7 +166,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll { test("save and load case class RDD with Nones as orc") { val data = OptionalReflectData(None, None, None, None, None) - val rdd = sparkContext.parallelize(data :: Nil) + val rdd = sparkContext.parallelize(data :: data :: data :: Nil) val tempDir = getTempFilePath("orcTest").getCanonicalPath rdd.saveAsOrcFile(tempDir) val readFile = TestHive.orcFile(tempDir) @@ -186,7 +189,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll { } test("Get ORC Schema with ORC Reader") { - val path = "sql/hive/src/test/resources/data/files/orcfiles" + val path = "src/test/resources/data/files/orcfiles" val attributes = OrcFileOperator.orcSchema(path, Some(TestHive.sparkContext.hadoopConfiguration), new Properties()) assert(attributes(0).dataType == StringType) assert(attributes(1).dataType == IntegerType) @@ -222,6 +225,4 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll { assert(actualCodec == CompressionKind.LZO) Utils.deleteRecursively(new File(tempDir)) } - - } From 4e1c839960c6ca25ec16d88a750f75a28a25d4bf Mon Sep 17 00:00:00 2001 From: scwf Date: Sat, 18 Oct 2014 21:35:57 -0700 Subject: [PATCH 18/32] minor fix --- .../src/main/scala/org/apache/spark/sql/hive/HiveContext.scala | 2 +- 
.../org/apache/spark/sql/hive/orc/OrcTableOperations.scala | 2 +- .../src/main/scala/org/apache/spark/sql/hive/orc/package.scala | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index 0f7d5a7fd92bc..8d6530d308456 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -23,7 +23,7 @@ import java.util.{ArrayList => JArrayList} import scala.collection.JavaConversions._ import scala.language.implicitConversions -import scala.reflect.runtime.universe.TypeTag +import scala.reflect.runtime.universe.{TypeTag, typeTag} import scala.Some import org.apache.hadoop.fs.FileSystem diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala index f15a7c5a4a814..82aeb12a55a8b 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala @@ -132,7 +132,7 @@ case class OrcTableScan( * @return Pruned TableScan. */ def pruneColumns(prunedAttributes: Seq[Attribute]): OrcTableScan = { - // Todo: prune projection + // TODO: prune projection OrcTableScan(prunedAttributes, relation, columnPruningPred) } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala index 7998e6a60f079..14fe51b92d4b1 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala @@ -43,6 +43,7 @@ package object orc { } } + // TypeConverter for InsertIntoOrcTable object HadoopTypeConverter extends HiveInspectors { def wrap(a: (Any, ObjectInspector)): Any = a match { case (s: String, oi: JavaHiveVarcharObjectInspector) => From b8495d57aed62be6ae4d793afc3c652657167e17 Mon Sep 17 00:00:00 2001 From: wangfei Date: Sun, 19 Oct 2014 10:53:35 -0700 Subject: [PATCH 19/32] add test case for reading/righting empty RDD --- .../spark/sql/hive/orc/ORCQuerySuite.scala | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala index 791d7c01066de..0430fcd19a219 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala @@ -111,6 +111,22 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll { Utils.deleteRecursively(new File(tempDir)) } + test("read/right empty RDD") { + val tempDir = getTempFilePath("orcTest").getCanonicalPath + val range = (0 to 255) + val data = sparkContext + .parallelize(range) + .map(x => AllDataTypes( + s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0)) + .filter(_.intField > 500) + data.saveAsOrcFile(tempDir) + checkAnswer( + TestHive.orcFile(tempDir), + data.toSchemaRDD.collect().toSeq) + + Utils.deleteRecursively(new File(tempDir)) + } + test("Read/Write All Types with non-primitive type") { val tempDir = getTempFilePath("orcTest").getCanonicalPath val range = (0 to 255) From f680da07742605e6a38bf4132477e063b2b22548 Mon Sep 17 00:00:00 2001 From: wangfei Date: Sun, 19 
Oct 2014 11:23:21 -0700 Subject: [PATCH 20/32] no use method --- .../spark/sql/hive/orc/OrcTableOperations.scala | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala index 82aeb12a55a8b..7931425675128 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala @@ -124,17 +124,6 @@ case class OrcTableScan( ColumnProjectionUtils.appendReadColumnNames(conf, names) } } - - /** - * Applies a (candidate) projection. - * - * @param prunedAttributes The list of attributes to be used in the projection. - * @return Pruned TableScan. - */ - def pruneColumns(prunedAttributes: Seq[Attribute]): OrcTableScan = { - // TODO: prune projection - OrcTableScan(prunedAttributes, relation, columnPruningPred) - } } /** From 269b15f40fceb9c628f9c9d57cdf042e05e07ac9 Mon Sep 17 00:00:00 2001 From: Zhan Zhang Date: Wed, 22 Oct 2014 15:51:20 -0700 Subject: [PATCH 21/32] make wrap consistent with InsertIntoHiveTable.wrapperFor --- .../sql/hive/orc/OrcTableOperations.scala | 3 +- .../apache/spark/sql/hive/orc/package.scala | 46 ++++++++++--------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala index 7931425675128..4b2a8511e8e49 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala @@ -188,10 +188,11 @@ private[sql] case class InsertIntoOrcTable( val fieldOIs = standardOI .getAllStructFieldRefs.map(_.getFieldObjectInspector).toArray val outputData = new Array[Any](fieldOIs.length) + val wrappers = fieldOIs.map(HadoopTypeConverter.wrapperFor) iter.map { row => var i = 0 while (i < row.length) { - outputData(i) = HadoopTypeConverter.wrap((row(i), fieldOIs(i))) + outputData(i) = wrappers(i)(row(i)) i += 1 } orcSerde.serialize(outputData, standardOI) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala index 14fe51b92d4b1..81eea055e8b60 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala @@ -45,32 +45,36 @@ package object orc { // TypeConverter for InsertIntoOrcTable object HadoopTypeConverter extends HiveInspectors { - def wrap(a: (Any, ObjectInspector)): Any = a match { - case (s: String, oi: JavaHiveVarcharObjectInspector) => - new HiveVarchar(s, s.size) + def wrapperFor(oi: ObjectInspector): Any => Any = oi match { + case _: JavaHiveVarcharObjectInspector => + (o: Any) => new HiveVarchar(o.asInstanceOf[String], o.asInstanceOf[String].size) - case (bd: BigDecimal, oi: JavaHiveDecimalObjectInspector) => - new HiveDecimal(bd.underlying()) + case _: JavaHiveDecimalObjectInspector => + (o: Any) => new HiveDecimal(o.asInstanceOf[BigDecimal].underlying()) - case (row: Row, oi: StandardStructObjectInspector) => - val struct = oi.create() - row.zip(oi.getAllStructFieldRefs: Seq[StructField]).foreach { - case (data, field) => - oi.setStructFieldData(struct, field, wrap(data, field.getFieldObjectInspector)) + case soi: StandardStructObjectInspector => + val 
wrappers = soi.getAllStructFieldRefs.map(ref => wrapperFor(ref.getFieldObjectInspector)) + (o: Any) => { + val struct = soi.create() + (soi.getAllStructFieldRefs, wrappers, o.asInstanceOf[Row]).zipped.foreach { + (field, wrapper, data) => soi.setStructFieldData(struct, field, wrapper(data)) + } + struct } - struct - case (s: Seq[_], oi: ListObjectInspector) => - val wrappedSeq = s.map(wrap(_, oi.getListElementObjectInspector)) - seqAsJavaList(wrappedSeq) - case (m: Map[_, _], oi: MapObjectInspector) => - val keyOi = oi.getMapKeyObjectInspector - val valueOi = oi.getMapValueObjectInspector - val wrappedMap = m.map { case (key, value) => wrap(key, keyOi) -> wrap(value, valueOi) } - mapAsJavaMap(wrappedMap) + case loi: ListObjectInspector => + val wrapper = wrapperFor(loi.getListElementObjectInspector) + (o: Any) => seqAsJavaList(o.asInstanceOf[Seq[_]].map(wrapper)) - case (obj, _) => - obj + case moi: MapObjectInspector => + val keyWrapper = wrapperFor(moi.getMapKeyObjectInspector) + val valueWrapper = wrapperFor(moi.getMapValueObjectInspector) + (o: Any) => mapAsJavaMap(o.asInstanceOf[Map[_, _]].map { case (key, value) => + keyWrapper(key) -> valueWrapper(value) + }) + + case _ => + identity[Any] } } From 9bd3c859729887eb0787cf5943ea15f868c4766d Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Fri, 14 Nov 2014 12:36:26 -0800 Subject: [PATCH 22/32] First draft. --- .../apache/spark/sql/parquet/NewParquet.scala | 147 ++++++++++++++++++ .../sql/parquet/ParquetTableOperations.scala | 30 +++- 2 files changed, 175 insertions(+), 2 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/parquet/NewParquet.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/NewParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/NewParquet.scala new file mode 100644 index 0000000000000..a08326a698f2b --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/NewParquet.scala @@ -0,0 +1,147 @@ +package org.apache.spark.sql.parquet + +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.mapreduce.Job +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat +import org.apache.spark.Logging +import org.apache.spark.rdd.RDD + +import org.apache.spark.sql.{SQLConf, Row, SQLContext} +import org.apache.spark.sql.catalyst.types.StructType +import org.apache.spark.sql.sources._ +import parquet.hadoop.ParquetInputFormat +import parquet.hadoop.util.ContextUtil + +class DefaultSource extends RelationProvider { + /** Returns a new base relation with the given parameters. 
*/ + override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { + ParquetRelation2(parameters("path"))(sqlContext) + } +} + +case class Partition(partitioningKeys: Map[String, Any], files: Seq[String]) + +case class ParquetRelation2( + path: String)( + @transient val sqlContext: SQLContext) extends PrunedFilteredScan with Logging { + + def sparkContext = sqlContext.sparkContext + + val (partitionKeys: Seq[String], partitions: Seq[Partition]) = { + val fs = FileSystem.get(new java.net.URI(path), sparkContext.hadoopConfiguration) + val partValue = "([^=]+)=([^=]+)".r + + val folders = fs.listStatus(new Path(path)).filter(s => s.isDir).map(_.getPath.getName).map { + case partValue(key, value) => (key, value) + } + + if (folders.size > 0) { + val partitionKeys = folders.map(_._1).distinct + if (partitionKeys.size > 1) { + sys.error(s"Too many distinct partition keys: $partitionKeys") + } + + (partitionKeys.toSeq, folders.map { case (key, value) => + val files = s"$path/$key=$value" :: Nil + Partition(Map(key -> value.toInt), files) + }.toSeq) + + } else { + (Nil, Partition(Map.empty, path :: Nil) :: Nil) + } + } + + val schema = + StructType.fromAttributes( + ParquetTypesConverter.readSchemaFromFile( + new Path(partitions.head.files.head), + None, // TODO + sqlContext.isParquetBinaryAsString)) + + override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { + val job = new Job(sparkContext.hadoopConfiguration) + ParquetInputFormat.setReadSupportClass(job, classOf[RowReadSupport]) + val conf: Configuration = ContextUtil.getConfiguration(job) + + val requestedSchema = StructType(requiredColumns.map(schema(_))) + + val partitionFilters = filters.collect { + case e @ EqualTo(attr, value) if partitionKeys.contains(attr) => + logInfo(s"Parquet scan partition filter: $attr=$value") + (p: Partition) => p.partitioningKeys(attr) == value + + case e @ In(attr, values) if partitionKeys.contains(attr) => + logInfo(s"Parquet scan partition filter: $attr IN ${values.mkString("{", ",", "}")}") + val set = values.toSet + (p: Partition) => set.contains(p.partitioningKeys(attr)) + + case e @ GreaterThan(attr, value) if partitionKeys.contains(attr) => + logInfo(s"Parquet scan partition filter: $attr > $value") + (p: Partition) => p.partitioningKeys(attr).asInstanceOf[Int] > value.asInstanceOf[Int] + + case e @ GreaterThanOrEqual(attr, value) if partitionKeys.contains(attr) => + logInfo(s"Parquet scan partition filter: $attr >= $value") + (p: Partition) => p.partitioningKeys(attr).asInstanceOf[Int] >= value.asInstanceOf[Int] + + case e @ LessThan(attr, value) if partitionKeys.contains(attr) => + logInfo(s"Parquet scan partition filter: $attr < $value") + (p: Partition) => p.partitioningKeys(attr).asInstanceOf[Int] < value.asInstanceOf[Int] + + case e @ LessThanOrEqual(attr, value) if partitionKeys.contains(attr) => + logInfo(s"Parquet scan partition filter: $attr <= $value") + (p: Partition) => p.partitioningKeys(attr).asInstanceOf[Int] <= value.asInstanceOf[Int] + } + + val selectedPartitions = partitions.filter(p => partitionFilters.forall(_(p))) + + selectedPartitions.flatMap(_.files).foreach { curPath => + val qualifiedPath = { + val path = new Path(curPath) + path.getFileSystem(conf).makeQualified(path) + } + org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, qualifiedPath) + } + + def fractionRead = selectedPartitions.size.toDouble / partitions.size.toDouble * 100 + logInfo(s"Reading $fractionRead% of $path 
partitions") + + // Store both requested and original schema in `Configuration` + conf.set( + RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA, + ParquetTypesConverter.convertToString(requestedSchema.toAttributes)) + conf.set( + RowWriteSupport.SPARK_ROW_SCHEMA, + ParquetTypesConverter.convertToString(schema.toAttributes)) + + // Tell FilteringParquetRowInputFormat whether it's okay to cache Parquet and FS metadata + conf.set( + SQLConf.PARQUET_CACHE_METADATA, + sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA, "true")) + + val baseRDD = + new org.apache.spark.rdd.NewHadoopRDD( + sparkContext, + classOf[FilteringParquetRowInputFormat], + classOf[Void], + classOf[Row], + conf) + + baseRDD.map(_._2) + } + + override val sizeInBytes = { + def calculateTableSize(fs: FileSystem, path: Path): Long = { + val fileStatus = fs.getFileStatus(path) + val size = if (fileStatus.isDir) { + fs.listStatus(path).map(status => calculateTableSize(fs, status.getPath)).sum + } else { + fileStatus.getLen + } + + size + } + val fs = FileSystem.get(new java.net.URI(path), sparkContext.hadoopConfiguration) + calculateTableSize(fs, new Path(path)) + } +} \ No newline at end of file diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala index 74c43e053b03c..63f2456e8c537 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala @@ -29,7 +29,7 @@ import scala.util.Try import com.google.common.cache.CacheBuilder import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{BlockLocation, FileStatus, Path} +import org.apache.hadoop.fs.{FileSystem, BlockLocation, FileStatus, Path} import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat} import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat => NewFileOutputFormat} @@ -369,7 +369,7 @@ private[parquet] class FilteringParquetRowInputFormat private var footers: JList[Footer] = _ - private var fileStatuses= Map.empty[Path, FileStatus] + private var fileStatuses = Map.empty[Path, FileStatus] override def createRecordReader( inputSplit: InputSplit, @@ -390,6 +390,26 @@ private[parquet] class FilteringParquetRowInputFormat } } + override def listStatus(jobContext: JobContext): JList[FileStatus] = { + val conf = ContextUtil.getConfiguration(jobContext) + val paths = NewFileInputFormat.getInputPaths(jobContext) + if (paths.isEmpty) { return new ArrayList[FileStatus]() } + val fs = paths.head.getFileSystem(conf) + val statuses = new ArrayList[FileStatus] + paths.foreach { p => + val cached = FilteringParquetRowInputFormat.statusCache.getIfPresent(p) + if (cached == null) { + logWarning(s"Status cache miss for $p") + val res = fs.listStatus(p).filterNot(_.getPath.getName.startsWith("_")) + res.foreach(statuses.add) + FilteringParquetRowInputFormat.statusCache.put(p, res) + } else { + cached.foreach(statuses.add) + } + } + statuses + } + override def getFooters(jobContext: JobContext): JList[Footer] = { import FilteringParquetRowInputFormat.footerCache @@ -414,7 +434,9 @@ private[parquet] class FilteringParquetRowInputFormat } val newFooters = new mutable.HashMap[FileStatus, Footer] if (toFetch.size > 0) { + val startFetch = System.currentTimeMillis val fetched = getFooters(conf, toFetch) + logInfo(s"Fetched $toFetch footers in ${System.currentTimeMillis - 
startFetch} ms") for ((status, i) <- toFetch.zipWithIndex) { newFooters(status) = fetched.get(i) } @@ -598,6 +620,10 @@ private[parquet] class FilteringParquetRowInputFormat } private[parquet] object FilteringParquetRowInputFormat { + val statusCache = CacheBuilder.newBuilder() + .maximumSize(20000) + .build[Path, Array[FileStatus]]() + private val footerCache = CacheBuilder.newBuilder() .maximumSize(20000) .build[FileStatus, Footer]() From 5f5fda8244c59970d117a0e9d32bd8bddde348f2 Mon Sep 17 00:00:00 2001 From: w00228970 Date: Mon, 17 Nov 2014 10:34:17 +0800 Subject: [PATCH 23/32] https -> http in pom --- pom.xml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pom.xml b/pom.xml index 639ea22a1fda3..26fa0e823edbf 100644 --- a/pom.xml +++ b/pom.xml @@ -156,7 +156,7 @@ central Maven Repository - https://repo1.maven.org/maven2 + http://repo1.maven.org/maven2 true @@ -167,7 +167,7 @@ apache-repo Apache Repository - https://repository.apache.org/content/repositories/releases + http://repository.apache.org/content/repositories/releases true @@ -178,7 +178,7 @@ jboss-repo JBoss Repository - https://repository.jboss.org/nexus/content/repositories/releases + http://repository.jboss.org/nexus/content/repositories/releases true @@ -189,7 +189,7 @@ mqtt-repo MQTT Repository - https://repo.eclipse.org/content/repositories/paho-releases + http://repo.eclipse.org/content/repositories/paho-releases true @@ -200,7 +200,7 @@ cloudera-repo Cloudera Repository - https://repository.cloudera.com/artifactory/cloudera-repos + http://repository.cloudera.com/artifactory/cloudera-repos true @@ -222,7 +222,7 @@ spring-releases Spring Release Repository - https://repo.spring.io/libs-release + http://repo.spring.io/libs-release true @@ -234,7 +234,7 @@ spark-staging Spring Staging Repository - https://oss.sonatype.org/content/repositories/orgspark-project-1085 + http://oss.sonatype.org/content/repositories/orgspark-project-1085 true @@ -246,7 +246,7 @@ spark-staging-hive13 Spring Staging Repository Hive 13 - https://oss.sonatype.org/content/repositories/orgspark-project-1089/ + http://oss.sonatype.org/content/repositories/orgspark-project-1089/ true @@ -258,7 +258,7 @@ central - https://repo1.maven.org/maven2 + http://repo1.maven.org/maven2 true From 5d7f863083e554db94f042d4ecf3112081950684 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Tue, 18 Nov 2014 14:30:27 -0800 Subject: [PATCH 24/32] logging / formatting improvements. 
--- .../org/apache/spark/sql/parquet/ParquetTableOperations.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala index 5d0643a64a044..0e36852ddd9b0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala @@ -361,7 +361,7 @@ private[parquet] class FilteringParquetRowInputFormat private var footers: JList[Footer] = _ - private var fileStatuses= Map.empty[Path, FileStatus] + private var fileStatuses = Map.empty[Path, FileStatus] override def createRecordReader( inputSplit: InputSplit, @@ -405,7 +405,9 @@ private[parquet] class FilteringParquetRowInputFormat } val newFooters = new mutable.HashMap[FileStatus, Footer] if (toFetch.size > 0) { + val startFetch = System.currentTimeMillis val fetched = getFooters(conf, toFetch) + logInfo(s"Fetched $toFetch footers in ${System.currentTimeMillis - startFetch} ms") for ((status, i) <- toFetch.zipWithIndex) { newFooters(status) = fetched.get(i) } From 94e0d40496ead35e4baab5133baffe7b0b71b83b Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Tue, 18 Nov 2014 14:30:55 -0800 Subject: [PATCH 25/32] Add an experimental interface to data sources that exposes catalyst expressions. --- .../sql/sources/DataSourceStrategy.scala | 43 ++++++++++++++----- .../apache/spark/sql/sources/interfaces.scala | 22 +++++++++- 2 files changed, 53 insertions(+), 12 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala index 954e86822de17..37853d4d03019 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala @@ -31,6 +31,13 @@ import org.apache.spark.sql.execution.SparkPlan */ private[sql] object DataSourceStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case PhysicalOperation(projectList, filters, l @ LogicalRelation(t: CatalystScan)) => + pruneFilterProjectRaw( + l, + projectList, + filters, + (a, f) => t.buildScan(a, f)) :: Nil + case PhysicalOperation(projectList, filters, l @ LogicalRelation(t: PrunedFilteredScan)) => pruneFilterProject( l, @@ -51,19 +58,35 @@ private[sql] object DataSourceStrategy extends Strategy { case _ => Nil } + // Based on Public API. protected def pruneFilterProject( - relation: LogicalRelation, - projectList: Seq[NamedExpression], - filterPredicates: Seq[Expression], - scanBuilder: (Array[String], Array[Filter]) => RDD[Row]) = { + relation: LogicalRelation, + projectList: Seq[NamedExpression], + filterPredicates: Seq[Expression], + scanBuilder: (Array[String], Array[Filter]) => RDD[Row]) = { + pruneFilterProjectRaw( + relation, + projectList, + filterPredicates, + (requestedColumns, pushedFilters) => { + scanBuilder(requestedColumns.map(_.name).toArray, selectFilters(pushedFilters).toArray) + }) + } + + // Based on Catalyst expressions. 
+ protected def pruneFilterProjectRaw( + relation: LogicalRelation, + projectList: Seq[NamedExpression], + filterPredicates: Seq[Expression], + scanBuilder: (Seq[Attribute], Seq[Expression]) => RDD[Row]) = { val projectSet = AttributeSet(projectList.flatMap(_.references)) val filterSet = AttributeSet(filterPredicates.flatMap(_.references)) val filterCondition = filterPredicates.reduceLeftOption(And) - val pushedFilters = selectFilters(filterPredicates.map { _ transform { + val pushedFilters = filterPredicates.map { _ transform { case a: AttributeReference => relation.attributeMap(a) // Match original case of attributes. - }}).toArray + }} if (projectList.map(_.toAttribute) == projectList && projectSet.size == projectList.size && @@ -74,8 +97,6 @@ private[sql] object DataSourceStrategy extends Strategy { val requestedColumns = projectList.asInstanceOf[Seq[Attribute]] // Safe due to if above. .map(relation.attributeMap) // Match original case of attributes. - .map(_.name) - .toArray val scan = execution.PhysicalRDD( @@ -84,14 +105,14 @@ private[sql] object DataSourceStrategy extends Strategy { filterCondition.map(execution.Filter(_, scan)).getOrElse(scan) } else { val requestedColumns = (projectSet ++ filterSet).map(relation.attributeMap).toSeq - val columnNames = requestedColumns.map(_.name).toArray - val scan = execution.PhysicalRDD(requestedColumns, scanBuilder(columnNames, pushedFilters)) + val scan = + execution.PhysicalRDD(requestedColumns, scanBuilder(requestedColumns, pushedFilters)) execution.Project(projectList, filterCondition.map(execution.Filter(_, scan)).getOrElse(scan)) } } - protected def selectFilters(filters: Seq[Expression]): Seq[Filter] = filters.collect { + protected[sql] def selectFilters(filters: Seq[Expression]): Seq[Filter] = filters.collect { case expressions.EqualTo(a: Attribute, Literal(v, _)) => EqualTo(a.name, v) case expressions.EqualTo(Literal(v, _), a: Attribute) => EqualTo(a.name, v) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala index 861638b1e99b6..2b8fc05fc0102 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala @@ -16,12 +16,13 @@ */ package org.apache.spark.sql.sources -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{Experimental, DeveloperApi} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{SQLConf, Row, SQLContext, StructType} import org.apache.spark.sql.catalyst.expressions.{Expression, Attribute} /** + * ::DeveloperApi:: * Implemented by objects that produce relations for a specific kind of data source. When * Spark SQL is given a DDL operation with a USING clause specified, this interface is used to * pass in the parameters specified by a user. @@ -40,6 +41,7 @@ trait RelationProvider { } /** + * ::DeveloperApi:: * Represents a collection of tuples with a known schema. Classes that extend BaseRelation must * be able to produce the schema of their data in the form of a [[StructType]] Concrete * implementation should inherit from one of the descendant `Scan` classes, which define various @@ -65,6 +67,7 @@ abstract class BaseRelation { } /** + * ::DeveloperApi:: * A BaseRelation that can produce all of its tuples as an RDD of Row objects. 
*/ @DeveloperApi @@ -73,6 +76,7 @@ abstract class TableScan extends BaseRelation { } /** + * ::DeveloperApi:: * A BaseRelation that can eliminate unneeded columns before producing an RDD * containing all of its tuples as Row objects. */ @@ -82,6 +86,7 @@ abstract class PrunedScan extends BaseRelation { } /** + * ::DeveloperApi:: * A BaseRelation that can eliminate unneeded columns and filter using selected * predicates before producing an RDD containing all matching tuples as Row objects. * @@ -93,3 +98,18 @@ abstract class PrunedScan extends BaseRelation { abstract class PrunedFilteredScan extends BaseRelation { def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] } + +/** + * ::Experimental:: + * An interface for experimenting with a more direct connection to the query planner. Compared to + * [[PrunedFilteredScan]], this operator receives the raw expressions from the + * [[org.apache.spark.sql.catalyst.plans.logical.LogicalPlan]]. Unlike the other APIs this + * interface is not designed to be binary compatible across releases and thus should only be used + * for experimentation. + */ +@Experimental +abstract class CatalystScan extends BaseRelation { + def buildScan(requiredColumns: Seq[Attribute], filters: Seq[Expression]): RDD[Row] +} + + From dd78aa7c4921408a27d9d9fb1a3cbb4867438d6b Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Tue, 18 Nov 2014 14:34:07 -0800 Subject: [PATCH 26/32] Alternative implementation of parquet based on the datasources API. --- .../apache/spark/sql/parquet/newParquet.scala | 291 ++++++++++++++++++ ...tastoreSuite.scala => parquetSuites.scala} | 178 +++++++---- 2 files changed, 403 insertions(+), 66 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala rename sql/hive/src/test/scala/org/apache/spark/sql/parquet/{ParquetMetastoreSuite.scala => parquetSuites.scala} (63%) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala new file mode 100644 index 0000000000000..c888a706c10cd --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -0,0 +1,291 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.parquet + +import java.util.{List => JList} + +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} +import org.apache.hadoop.conf.{Configurable, Configuration} +import org.apache.hadoop.io.Writable +import org.apache.hadoop.mapreduce.{JobContext, InputSplit, Job} + +import parquet.hadoop.ParquetInputFormat +import parquet.hadoop.util.ContextUtil + +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.{Partition => SparkPartition, Logging} +import org.apache.spark.rdd.{NewHadoopPartition, RDD} + +import org.apache.spark.sql.{SQLConf, Row, SQLContext} +import org.apache.spark.sql.catalyst.expressions.{SpecificMutableRow, And, Expression, Attribute} +import org.apache.spark.sql.catalyst.types.{IntegerType, StructField, StructType} +import org.apache.spark.sql.sources._ + +import scala.collection.JavaConversions._ + +/** + * Allows creation of parquet based tables using the syntax + * `CREATE TABLE ... USING org.apache.spark.sql.parquet`. Currently the only option required + * is `path`, which is should be the location of a collection of, optionally partitioned, + * parquet files. + */ +class DefaultSource extends RelationProvider { + /** Returns a new base relation with the given parameters. */ + override def createRelation( + sqlContext: SQLContext, + parameters: Map[String, String]): BaseRelation = { + val path = + parameters.getOrElse("path", sys.error("'path' must be specifed for parquet tables.")) + + ParquetRelation2(path)(sqlContext) + } +} + +private[parquet] case class Partition(partitionValues: Map[String, Any], files: Seq[FileStatus]) + +/** + * An alternative to [[ParquetRelation]] that plugs in using the data sources API. This class is + * currently not intended as a full replacement of the parquet support in Spark SQL though it is + * likely that it will eventually subsume the existing physical plan implementation. + * + * Compared with the current implementation, this class has the following notable differences: + * + * Partitioning: Partitions are auto discovered and must be in the form of directories `key=value/` + * located at `path`. Currently only a single partitioning column is supported and it must + * be an integer. This class supports both fully self-describing data, which contains the partition + * key, and data where the partition key is only present in the folder structure. The presence + * of the partitioning key in the data is also auto-detected. + * + * Metadata: The metadata is automatically discovered by reading the first parquet file present. + * There is currently no support for working with files that have different schema. Additionally, + * when parquet metadata caching is turned on, the FileStatus objects for all data will be cached + * to improve the speed of interactive querying. When data is added to a table it must be dropped + * and recreated to pick up any changes. + * + * Statistics: Statistics for the size of the table are automatically populated during metadata + * discovery. + */ +@DeveloperApi +case class ParquetRelation2(path: String)(@transient val sqlContext: SQLContext) + extends CatalystScan with Logging { + + def sparkContext = sqlContext.sparkContext + + // Minor Hack: scala doesnt seem to respect @transient for vals declared via extraction + @transient + private var partitionKeys: Seq[String] = _ + @transient + private var partitions: Seq[Partition] = _ + discoverPartitions() + + // TODO: Only finds the first partition, assumes the key is of type Integer... 
+ private def discoverPartitions() = { + val fs = FileSystem.get(new java.net.URI(path), sparkContext.hadoopConfiguration) + val partValue = "([^=]+)=([^=]+)".r + + val childrenOfPath = fs.listStatus(new Path(path)).filterNot(_.getPath.getName.startsWith("_")) + val childDirs = childrenOfPath.filter(s => s.isDir) + + if (childDirs.size > 0) { + val partitionPairs = childDirs.map(_.getPath.getName).map { + case partValue(key, value) => (key, value) + } + + val foundKeys = partitionPairs.map(_._1).distinct + if (foundKeys.size > 1) { + sys.error(s"Too many distinct partition keys: $foundKeys") + } + + // Do a parallel lookup of partition metadata. + val partitionFiles = + childDirs.par + .map(d => fs.listStatus(d.getPath).filterNot(_.getPath.getName.startsWith("_"))).seq + + partitionKeys = foundKeys.toSeq + partitions = partitionFiles.zip(partitionPairs).map { case (files, (key, value)) => + Partition(Map(key -> value.toInt), files) + }.toSeq + } else { + partitionKeys = Nil + partitions = Partition(Map.empty, childrenOfPath) :: Nil + } + } + + override val sizeInBytes = partitions.flatMap(_.files).map(_.getLen).sum + + val dataSchema = StructType.fromAttributes(// TODO: Parquet code should not deal with attributes. + ParquetTypesConverter.readSchemaFromFile( + partitions.head.files.head.getPath, + Some(sparkContext.hadoopConfiguration), + sqlContext.isParquetBinaryAsString)) + + val dataIncludesKey = + partitionKeys.headOption.map(dataSchema.fieldNames.contains(_)).getOrElse(true) + + override val schema = + if (dataIncludesKey) { + dataSchema + } else { + StructType( + StructField(partitionKeys.head, IntegerType) +: + dataSchema.fields + ) + } + + override def buildScan(output: Seq[Attribute], predicates: Seq[Expression]): RDD[Row] = { + // This is mostly a hack so that we can use the existing parquet filter code. + val requiredColumns = output.map(_.name) + // TODO: Parquet filters should be based on data sources API, not catalyst expressions. + val filters = DataSourceStrategy.selectFilters(predicates) + + val job = new Job(sparkContext.hadoopConfiguration) + ParquetInputFormat.setReadSupportClass(job, classOf[RowReadSupport]) + val jobConf: Configuration = ContextUtil.getConfiguration(job) + + val requestedSchema = StructType(requiredColumns.map(schema(_))) + + // TODO: Make folder based partitioning a first class citizen of the Data Sources API. 
+ val partitionFilters = filters.collect { + case e @ EqualTo(attr, value) if partitionKeys.contains(attr) => + logInfo(s"Parquet scan partition filter: $attr=$value") + (p: Partition) => p.partitionValues(attr) == value + + case e @ In(attr, values) if partitionKeys.contains(attr) => + logInfo(s"Parquet scan partition filter: $attr IN ${values.mkString("{", ",", "}")}") + val set = values.toSet + (p: Partition) => set.contains(p.partitionValues(attr)) + + case e @ GreaterThan(attr, value) if partitionKeys.contains(attr) => + logInfo(s"Parquet scan partition filter: $attr > $value") + (p: Partition) => p.partitionValues(attr).asInstanceOf[Int] > value.asInstanceOf[Int] + + case e @ GreaterThanOrEqual(attr, value) if partitionKeys.contains(attr) => + logInfo(s"Parquet scan partition filter: $attr >= $value") + (p: Partition) => p.partitionValues(attr).asInstanceOf[Int] >= value.asInstanceOf[Int] + + case e @ LessThan(attr, value) if partitionKeys.contains(attr) => + logInfo(s"Parquet scan partition filter: $attr < $value") + (p: Partition) => p.partitionValues(attr).asInstanceOf[Int] < value.asInstanceOf[Int] + + case e @ LessThanOrEqual(attr, value) if partitionKeys.contains(attr) => + logInfo(s"Parquet scan partition filter: $attr <= $value") + (p: Partition) => p.partitionValues(attr).asInstanceOf[Int] <= value.asInstanceOf[Int] + } + + val selectedPartitions = partitions.filter(p => partitionFilters.forall(_(p))) + + selectedPartitions.flatMap(_.files).foreach { status => + val path = status.getPath + val qualifiedPath = path.getFileSystem(jobConf).makeQualified(path) + org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, qualifiedPath) + } + + // Push down filters when possible + predicates + .reduceOption(And) + .flatMap(ParquetFilters.createFilter) + .filter(_ => sqlContext.parquetFilterPushDown) + .foreach(ParquetInputFormat.setFilterPredicate(jobConf, _)) + + def percentRead = selectedPartitions.size.toDouble / partitions.size.toDouble * 100 + logInfo(s"Reading $percentRead% of $path partitions") + + // Store both requested and original schema in `Configuration` + jobConf.set( + RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA, + ParquetTypesConverter.convertToString(requestedSchema.toAttributes)) + jobConf.set( + RowWriteSupport.SPARK_ROW_SCHEMA, + ParquetTypesConverter.convertToString(schema.toAttributes)) + + // Tell FilteringParquetRowInputFormat whether it's okay to cache Parquet and FS metadata + val useCache = sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA, "true").toBoolean + jobConf.set(SQLConf.PARQUET_CACHE_METADATA, useCache.toString) + + val baseRDD = + new org.apache.spark.rdd.NewHadoopRDD( + sparkContext, + classOf[FilteringParquetRowInputFormat], + classOf[Void], + classOf[Row], + jobConf) { + val cacheMetadata = useCache + + @transient + val cachedStatus = selectedPartitions.flatMap(_.files) + + // Overridden so we can inject our own cached files statuses. 
+ override def getPartitions: Array[SparkPartition] = { + val inputFormat = + if (cacheMetadata) { + new FilteringParquetRowInputFormat { + override def listStatus(jobContext: JobContext): JList[FileStatus] = cachedStatus + } + } else { + new FilteringParquetRowInputFormat + } + + inputFormat match { + case configurable: Configurable => + configurable.setConf(getConf) + case _ => + } + val jobContext = newJobContext(getConf, jobId) + val rawSplits = inputFormat.getSplits(jobContext).toArray + val result = new Array[SparkPartition](rawSplits.size) + for (i <- 0 until rawSplits.size) { + result(i) = + new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable]) + } + result + } + } + + // The ordinal for the partition key in the result row, if requested. + val partitionKeyLocation = + partitionKeys + .headOption + .map(requiredColumns.indexOf(_)) + .getOrElse(-1) + + // When the data does not include the key and the key is requested then we must fill it in + // based on information from the input split. + if (!dataIncludesKey && partitionKeyLocation != -1) { + baseRDD.mapPartitionsWithInputSplit { case (split, iter) => + val partValue = "([^=]+)=([^=]+)".r + val partValues = + split.asInstanceOf[parquet.hadoop.ParquetInputSplit] + .getPath + .toString + .split("/") + .flatMap { + case partValue(key, value) => Some(key -> value) + case _ => None + }.toMap + + val currentValue = partValues.values.head.toInt + iter.map { pair => + val res = pair._2.asInstanceOf[SpecificMutableRow] + res.setInt(partitionKeyLocation, currentValue) + res + } + } + } else { + baseRDD.map(_._2) + } + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala similarity index 63% rename from sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala rename to sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala index cc65242c0da9b..7159ebd0353ad 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/ParquetMetastoreSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala @@ -34,71 +34,52 @@ case class ParquetDataWithKey(p: Int, intField: Int, stringField: String) /** - * Tests for our SerDe -> Native parquet scan conversion. + * A suite to test the automatic conversion of metastore tables with parquet data to use the + * built in parquet support. 
*/ -class ParquetMetastoreSuite extends QueryTest with BeforeAndAfterAll { +class ParquetMetastoreSuite extends ParquetTest { override def beforeAll(): Unit = { - val partitionedTableDir = File.createTempFile("parquettests", "sparksql") - partitionedTableDir.delete() - partitionedTableDir.mkdir() - - (1 to 10).foreach { p => - val partDir = new File(partitionedTableDir, s"p=$p") - sparkContext.makeRDD(1 to 10) - .map(i => ParquetData(i, s"part-$p")) - .saveAsParquetFile(partDir.getCanonicalPath) - } - - val partitionedTableDirWithKey = File.createTempFile("parquettests", "sparksql") - partitionedTableDirWithKey.delete() - partitionedTableDirWithKey.mkdir() - - (1 to 10).foreach { p => - val partDir = new File(partitionedTableDirWithKey, s"p=$p") - sparkContext.makeRDD(1 to 10) - .map(i => ParquetDataWithKey(p, i, s"part-$p")) - .saveAsParquetFile(partDir.getCanonicalPath) - } + super.beforeAll() sql(s""" - create external table partitioned_parquet - ( - intField INT, - stringField STRING - ) - PARTITIONED BY (p int) - ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' - STORED AS - INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' - OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' - location '${partitionedTableDir.getCanonicalPath}' + create external table partitioned_parquet + ( + intField INT, + stringField STRING + ) + PARTITIONED BY (p int) + ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + STORED AS + INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + location '${partitionedTableDir.getCanonicalPath}' """) sql(s""" - create external table partitioned_parquet_with_key - ( - intField INT, - stringField STRING - ) - PARTITIONED BY (p int) - ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' - STORED AS - INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' - OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' - location '${partitionedTableDirWithKey.getCanonicalPath}' + create external table partitioned_parquet_with_key + ( + intField INT, + stringField STRING + ) + PARTITIONED BY (p int) + ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + STORED AS + INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + location '${partitionedTableDirWithKey.getCanonicalPath}' """) sql(s""" - create external table normal_parquet - ( - intField INT, - stringField STRING - ) - ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' - STORED AS - INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' - OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' - location '${new File(partitionedTableDir, "p=1").getCanonicalPath}' + create external table normal_parquet + ( + intField INT, + stringField STRING + ) + ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + STORED AS + INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + location '${new File(partitionedTableDir, "p=1").getCanonicalPath}' """) (1 to 10).foreach { p => @@ -116,6 +97,82 @@ class ParquetMetastoreSuite extends 
QueryTest with BeforeAndAfterAll { setConf("spark.sql.hive.convertMetastoreParquet", "false") } + test("conversion is working") { + assert( + sql("SELECT * FROM normal_parquet").queryExecution.executedPlan.collect { + case _: HiveTableScan => true + }.isEmpty) + assert( + sql("SELECT * FROM normal_parquet").queryExecution.executedPlan.collect { + case _: ParquetTableScan => true + }.nonEmpty) + } +} + +/** + * A suite of tests for the Parquet support through the data sources API. + */ +class ParquetSourceSuite extends ParquetTest { + override def beforeAll(): Unit = { + super.beforeAll() + + sql( s""" + create temporary table partitioned_parquet + USING org.apache.spark.sql.parquet + OPTIONS ( + path '${partitionedTableDir.getCanonicalPath}' + ) + """) + + sql( s""" + create temporary table partitioned_parquet_with_key + USING org.apache.spark.sql.parquet + OPTIONS ( + path '${partitionedTableDirWithKey.getCanonicalPath}' + ) + """) + + sql( s""" + create temporary table normal_parquet + USING org.apache.spark.sql.parquet + OPTIONS ( + path '${new File(partitionedTableDir, "p=1").getCanonicalPath}' + ) + """) + } +} + +/** + * A collection of tests for parquet data with various forms of partitioning. + */ +abstract class ParquetTest extends QueryTest with BeforeAndAfterAll { + var partitionedTableDir: File = null + var partitionedTableDirWithKey: File = null + + override def beforeAll(): Unit = { + partitionedTableDir = File.createTempFile("parquettests", "sparksql") + partitionedTableDir.delete() + partitionedTableDir.mkdir() + + (1 to 10).foreach { p => + val partDir = new File(partitionedTableDir, s"p=$p") + sparkContext.makeRDD(1 to 10) + .map(i => ParquetData(i, s"part-$p")) + .saveAsParquetFile(partDir.getCanonicalPath) + } + + partitionedTableDirWithKey = File.createTempFile("parquettests", "sparksql") + partitionedTableDirWithKey.delete() + partitionedTableDirWithKey.mkdir() + + (1 to 10).foreach { p => + val partDir = new File(partitionedTableDirWithKey, s"p=$p") + sparkContext.makeRDD(1 to 10) + .map(i => ParquetDataWithKey(p, i, s"part-$p")) + .saveAsParquetFile(partDir.getCanonicalPath) + } + } + Seq("partitioned_parquet", "partitioned_parquet_with_key").foreach { table => test(s"project the partitioning column $table") { checkAnswer( @@ -193,15 +250,4 @@ class ParquetMetastoreSuite extends QueryTest with BeforeAndAfterAll { sql("SELECT COUNT(*) FROM normal_parquet"), 10) } - - test("conversion is working") { - assert( - sql("SELECT * FROM normal_parquet").queryExecution.executedPlan.collect { - case _: HiveTableScan => true - }.isEmpty) - assert( - sql("SELECT * FROM normal_parquet").queryExecution.executedPlan.collect { - case _: ParquetTableScan => true - }.nonEmpty) - } } From a37f6a843c2c87594e4796c102ef8b59e39a9970 Mon Sep 17 00:00:00 2001 From: w00228970 Date: Thu, 20 Nov 2014 09:56:06 +0800 Subject: [PATCH 27/32] wf comment --- .../apache/spark/sql/parquet/newParquet.scala | 288 +++++------------- 1 file changed, 72 insertions(+), 216 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index c888a706c10cd..a08326a698f2b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -1,291 +1,147 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ package org.apache.spark.sql.parquet -import java.util.{List => JList} - -import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} -import org.apache.hadoop.conf.{Configurable, Configuration} -import org.apache.hadoop.io.Writable -import org.apache.hadoop.mapreduce.{JobContext, InputSplit, Job} - -import parquet.hadoop.ParquetInputFormat -import parquet.hadoop.util.ContextUtil - -import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.{Partition => SparkPartition, Logging} -import org.apache.spark.rdd.{NewHadoopPartition, RDD} +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.mapreduce.Job +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat +import org.apache.spark.Logging +import org.apache.spark.rdd.RDD import org.apache.spark.sql.{SQLConf, Row, SQLContext} -import org.apache.spark.sql.catalyst.expressions.{SpecificMutableRow, And, Expression, Attribute} -import org.apache.spark.sql.catalyst.types.{IntegerType, StructField, StructType} +import org.apache.spark.sql.catalyst.types.StructType import org.apache.spark.sql.sources._ +import parquet.hadoop.ParquetInputFormat +import parquet.hadoop.util.ContextUtil -import scala.collection.JavaConversions._ - -/** - * Allows creation of parquet based tables using the syntax - * `CREATE TABLE ... USING org.apache.spark.sql.parquet`. Currently the only option required - * is `path`, which is should be the location of a collection of, optionally partitioned, - * parquet files. - */ class DefaultSource extends RelationProvider { /** Returns a new base relation with the given parameters. */ - override def createRelation( - sqlContext: SQLContext, - parameters: Map[String, String]): BaseRelation = { - val path = - parameters.getOrElse("path", sys.error("'path' must be specifed for parquet tables.")) - - ParquetRelation2(path)(sqlContext) + override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { + ParquetRelation2(parameters("path"))(sqlContext) } } -private[parquet] case class Partition(partitionValues: Map[String, Any], files: Seq[FileStatus]) +case class Partition(partitioningKeys: Map[String, Any], files: Seq[String]) -/** - * An alternative to [[ParquetRelation]] that plugs in using the data sources API. This class is - * currently not intended as a full replacement of the parquet support in Spark SQL though it is - * likely that it will eventually subsume the existing physical plan implementation. - * - * Compared with the current implementation, this class has the following notable differences: - * - * Partitioning: Partitions are auto discovered and must be in the form of directories `key=value/` - * located at `path`. Currently only a single partitioning column is supported and it must - * be an integer. 
This class supports both fully self-describing data, which contains the partition - * key, and data where the partition key is only present in the folder structure. The presence - * of the partitioning key in the data is also auto-detected. - * - * Metadata: The metadata is automatically discovered by reading the first parquet file present. - * There is currently no support for working with files that have different schema. Additionally, - * when parquet metadata caching is turned on, the FileStatus objects for all data will be cached - * to improve the speed of interactive querying. When data is added to a table it must be dropped - * and recreated to pick up any changes. - * - * Statistics: Statistics for the size of the table are automatically populated during metadata - * discovery. - */ -@DeveloperApi -case class ParquetRelation2(path: String)(@transient val sqlContext: SQLContext) - extends CatalystScan with Logging { +case class ParquetRelation2( + path: String)( + @transient val sqlContext: SQLContext) extends PrunedFilteredScan with Logging { def sparkContext = sqlContext.sparkContext - // Minor Hack: scala doesnt seem to respect @transient for vals declared via extraction - @transient - private var partitionKeys: Seq[String] = _ - @transient - private var partitions: Seq[Partition] = _ - discoverPartitions() - - // TODO: Only finds the first partition, assumes the key is of type Integer... - private def discoverPartitions() = { + val (partitionKeys: Seq[String], partitions: Seq[Partition]) = { val fs = FileSystem.get(new java.net.URI(path), sparkContext.hadoopConfiguration) val partValue = "([^=]+)=([^=]+)".r - val childrenOfPath = fs.listStatus(new Path(path)).filterNot(_.getPath.getName.startsWith("_")) - val childDirs = childrenOfPath.filter(s => s.isDir) - - if (childDirs.size > 0) { - val partitionPairs = childDirs.map(_.getPath.getName).map { - case partValue(key, value) => (key, value) - } + val folders = fs.listStatus(new Path(path)).filter(s => s.isDir).map(_.getPath.getName).map { + case partValue(key, value) => (key, value) + } - val foundKeys = partitionPairs.map(_._1).distinct - if (foundKeys.size > 1) { - sys.error(s"Too many distinct partition keys: $foundKeys") + if (folders.size > 0) { + val partitionKeys = folders.map(_._1).distinct + if (partitionKeys.size > 1) { + sys.error(s"Too many distinct partition keys: $partitionKeys") } - // Do a parallel lookup of partition metadata. - val partitionFiles = - childDirs.par - .map(d => fs.listStatus(d.getPath).filterNot(_.getPath.getName.startsWith("_"))).seq - - partitionKeys = foundKeys.toSeq - partitions = partitionFiles.zip(partitionPairs).map { case (files, (key, value)) => + (partitionKeys.toSeq, folders.map { case (key, value) => + val files = s"$path/$key=$value" :: Nil Partition(Map(key -> value.toInt), files) - }.toSeq - } else { - partitionKeys = Nil - partitions = Partition(Map.empty, childrenOfPath) :: Nil - } - } - - override val sizeInBytes = partitions.flatMap(_.files).map(_.getLen).sum - - val dataSchema = StructType.fromAttributes(// TODO: Parquet code should not deal with attributes. 
- ParquetTypesConverter.readSchemaFromFile( - partitions.head.files.head.getPath, - Some(sparkContext.hadoopConfiguration), - sqlContext.isParquetBinaryAsString)) - - val dataIncludesKey = - partitionKeys.headOption.map(dataSchema.fieldNames.contains(_)).getOrElse(true) + }.toSeq) - override val schema = - if (dataIncludesKey) { - dataSchema } else { - StructType( - StructField(partitionKeys.head, IntegerType) +: - dataSchema.fields - ) + (Nil, Partition(Map.empty, path :: Nil) :: Nil) } + } - override def buildScan(output: Seq[Attribute], predicates: Seq[Expression]): RDD[Row] = { - // This is mostly a hack so that we can use the existing parquet filter code. - val requiredColumns = output.map(_.name) - // TODO: Parquet filters should be based on data sources API, not catalyst expressions. - val filters = DataSourceStrategy.selectFilters(predicates) + val schema = + StructType.fromAttributes( + ParquetTypesConverter.readSchemaFromFile( + new Path(partitions.head.files.head), + None, // TODO + sqlContext.isParquetBinaryAsString)) + override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { val job = new Job(sparkContext.hadoopConfiguration) ParquetInputFormat.setReadSupportClass(job, classOf[RowReadSupport]) - val jobConf: Configuration = ContextUtil.getConfiguration(job) + val conf: Configuration = ContextUtil.getConfiguration(job) val requestedSchema = StructType(requiredColumns.map(schema(_))) - // TODO: Make folder based partitioning a first class citizen of the Data Sources API. val partitionFilters = filters.collect { case e @ EqualTo(attr, value) if partitionKeys.contains(attr) => logInfo(s"Parquet scan partition filter: $attr=$value") - (p: Partition) => p.partitionValues(attr) == value + (p: Partition) => p.partitioningKeys(attr) == value case e @ In(attr, values) if partitionKeys.contains(attr) => logInfo(s"Parquet scan partition filter: $attr IN ${values.mkString("{", ",", "}")}") val set = values.toSet - (p: Partition) => set.contains(p.partitionValues(attr)) + (p: Partition) => set.contains(p.partitioningKeys(attr)) case e @ GreaterThan(attr, value) if partitionKeys.contains(attr) => logInfo(s"Parquet scan partition filter: $attr > $value") - (p: Partition) => p.partitionValues(attr).asInstanceOf[Int] > value.asInstanceOf[Int] + (p: Partition) => p.partitioningKeys(attr).asInstanceOf[Int] > value.asInstanceOf[Int] case e @ GreaterThanOrEqual(attr, value) if partitionKeys.contains(attr) => logInfo(s"Parquet scan partition filter: $attr >= $value") - (p: Partition) => p.partitionValues(attr).asInstanceOf[Int] >= value.asInstanceOf[Int] + (p: Partition) => p.partitioningKeys(attr).asInstanceOf[Int] >= value.asInstanceOf[Int] case e @ LessThan(attr, value) if partitionKeys.contains(attr) => logInfo(s"Parquet scan partition filter: $attr < $value") - (p: Partition) => p.partitionValues(attr).asInstanceOf[Int] < value.asInstanceOf[Int] + (p: Partition) => p.partitioningKeys(attr).asInstanceOf[Int] < value.asInstanceOf[Int] case e @ LessThanOrEqual(attr, value) if partitionKeys.contains(attr) => logInfo(s"Parquet scan partition filter: $attr <= $value") - (p: Partition) => p.partitionValues(attr).asInstanceOf[Int] <= value.asInstanceOf[Int] + (p: Partition) => p.partitioningKeys(attr).asInstanceOf[Int] <= value.asInstanceOf[Int] } val selectedPartitions = partitions.filter(p => partitionFilters.forall(_(p))) - selectedPartitions.flatMap(_.files).foreach { status => - val path = status.getPath - val qualifiedPath = 
path.getFileSystem(jobConf).makeQualified(path) + selectedPartitions.flatMap(_.files).foreach { curPath => + val qualifiedPath = { + val path = new Path(curPath) + path.getFileSystem(conf).makeQualified(path) + } org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, qualifiedPath) } - // Push down filters when possible - predicates - .reduceOption(And) - .flatMap(ParquetFilters.createFilter) - .filter(_ => sqlContext.parquetFilterPushDown) - .foreach(ParquetInputFormat.setFilterPredicate(jobConf, _)) - - def percentRead = selectedPartitions.size.toDouble / partitions.size.toDouble * 100 - logInfo(s"Reading $percentRead% of $path partitions") + def fractionRead = selectedPartitions.size.toDouble / partitions.size.toDouble * 100 + logInfo(s"Reading $fractionRead% of $path partitions") // Store both requested and original schema in `Configuration` - jobConf.set( + conf.set( RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA, ParquetTypesConverter.convertToString(requestedSchema.toAttributes)) - jobConf.set( + conf.set( RowWriteSupport.SPARK_ROW_SCHEMA, ParquetTypesConverter.convertToString(schema.toAttributes)) // Tell FilteringParquetRowInputFormat whether it's okay to cache Parquet and FS metadata - val useCache = sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA, "true").toBoolean - jobConf.set(SQLConf.PARQUET_CACHE_METADATA, useCache.toString) + conf.set( + SQLConf.PARQUET_CACHE_METADATA, + sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA, "true")) val baseRDD = new org.apache.spark.rdd.NewHadoopRDD( - sparkContext, - classOf[FilteringParquetRowInputFormat], - classOf[Void], - classOf[Row], - jobConf) { - val cacheMetadata = useCache - - @transient - val cachedStatus = selectedPartitions.flatMap(_.files) + sparkContext, + classOf[FilteringParquetRowInputFormat], + classOf[Void], + classOf[Row], + conf) - // Overridden so we can inject our own cached files statuses. - override def getPartitions: Array[SparkPartition] = { - val inputFormat = - if (cacheMetadata) { - new FilteringParquetRowInputFormat { - override def listStatus(jobContext: JobContext): JList[FileStatus] = cachedStatus - } - } else { - new FilteringParquetRowInputFormat - } + baseRDD.map(_._2) + } - inputFormat match { - case configurable: Configurable => - configurable.setConf(getConf) - case _ => - } - val jobContext = newJobContext(getConf, jobId) - val rawSplits = inputFormat.getSplits(jobContext).toArray - val result = new Array[SparkPartition](rawSplits.size) - for (i <- 0 until rawSplits.size) { - result(i) = - new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable]) - } - result - } + override val sizeInBytes = { + def calculateTableSize(fs: FileSystem, path: Path): Long = { + val fileStatus = fs.getFileStatus(path) + val size = if (fileStatus.isDir) { + fs.listStatus(path).map(status => calculateTableSize(fs, status.getPath)).sum + } else { + fileStatus.getLen } - // The ordinal for the partition key in the result row, if requested. - val partitionKeyLocation = - partitionKeys - .headOption - .map(requiredColumns.indexOf(_)) - .getOrElse(-1) - - // When the data does not include the key and the key is requested then we must fill it in - // based on information from the input split. 
- if (!dataIncludesKey && partitionKeyLocation != -1) { - baseRDD.mapPartitionsWithInputSplit { case (split, iter) => - val partValue = "([^=]+)=([^=]+)".r - val partValues = - split.asInstanceOf[parquet.hadoop.ParquetInputSplit] - .getPath - .toString - .split("/") - .flatMap { - case partValue(key, value) => Some(key -> value) - case _ => None - }.toMap - - val currentValue = partValues.values.head.toInt - iter.map { pair => - val res = pair._2.asInstanceOf[SpecificMutableRow] - res.setInt(partitionKeyLocation, currentValue) - res - } - } - } else { - baseRDD.map(_._2) + size } + val fs = FileSystem.get(new java.net.URI(path), sparkContext.hadoopConfiguration) + calculateTableSize(fs, new Path(path)) } -} +} \ No newline at end of file From e521e6a77ba33477a6a1c60769ecd0ef0107eeb3 Mon Sep 17 00:00:00 2001 From: w00228970 Date: Thu, 20 Nov 2014 13:00:21 +0800 Subject: [PATCH 28/32] draft for datasource api --- docs/sql-programming-guide.md | 90 ++++ .../apache/spark/sql/parquet/NewParquet.scala | 147 ------ .../sql/parquet/ParquetTableOperations.scala | 37 +- .../apache/spark/sql/parquet/newParquet.scala | 440 ++++++++++++------ .../org/apache/spark/sql/hive/orc/orc.scala | 245 ++++++++++ .../data/files/orcfiles/.part-r-1.orc.crc | Bin 0 -> 20 bytes .../data/files/orcfiles/.part-r-2.orc.crc | Bin 0 -> 20 bytes .../resources/data/files/orcfiles/_SUCCESS | 0 .../data/files/orcfiles/part-r-1.orc | Bin 0 -> 1296 bytes .../data/files/orcfiles/part-r-2.orc | Bin 0 -> 1286 bytes .../spark/sql/hive/orc/ORCQuerySuite.scala | 244 ++++++++++ 11 files changed, 900 insertions(+), 303 deletions(-) delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/parquet/NewParquet.scala create mode 100644 sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/orc.scala create mode 100644 sql/hive/src/test/resources/data/files/orcfiles/.part-r-1.orc.crc create mode 100644 sql/hive/src/test/resources/data/files/orcfiles/.part-r-2.orc.crc create mode 100644 sql/hive/src/test/resources/data/files/orcfiles/_SUCCESS create mode 100644 sql/hive/src/test/resources/data/files/orcfiles/part-r-1.orc create mode 100644 sql/hive/src/test/resources/data/files/orcfiles/part-r-2.orc create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 5500da83b2b66..0799ab17c574f 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -605,6 +605,96 @@ Configuration of Parquet can be done using the `setConf` method on SQLContext or +## ORC Files + +[ORC](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+ORC) is a columnar format that is supported in Hive, it provides a highly efficient way to store data on HDFS to speed up query performance. +Spark SQL provides support for both reading and writing ORC files that automatically preserves the schema +of the original data. + +### Loading Data Programmatically + +Using the data from the above example: + +
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
+
+{% highlight scala %}
+// Use HiveContext to read or write ORC files.
+val sqlContext = new HiveContext(sc)
+import sqlContext._
+val people: RDD[Person] = ... // An RDD of case class objects, from the previous example.
+
+// The RDD is implicitly converted to a SchemaRDD by createSchemaRDD, allowing it to be stored using ORC.
+people.registerTempTable("people")
+people.saveAsOrcFile("people.orc")
+
+// Read in the ORC file created above. ORC files are self-describing so the schema is preserved.
+// The result of loading an ORC file is also a SchemaRDD.
+val orcFile = sqlContext.orcFile("people.orc")
+
+// ORC files can also be registered as tables and then used in SQL statements.
+orcFile.registerTempTable("orcFile")
+val teenagers = sqlContext.sql("SELECT name FROM orcFile WHERE age >= 13 AND age <= 19")
+teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
+{% endhighlight %}
+
+</div>
+
+<div data-lang="java" markdown="1">
+
+{% highlight java %}
+// Use JavaHiveContext to read or write ORC files.
+JavaHiveContext sqlContext = new org.apache.spark.sql.hive.api.java.JavaHiveContext(sc);
+JavaSchemaRDD schemaPeople = ... // The JavaSchemaRDD from the previous example.
+
+// JavaSchemaRDDs can be saved as ORC files, maintaining the schema information.
+schemaPeople.saveAsOrcFile("people.orc");
+
+// Read in the ORC file created above. ORC files are self-describing so the schema is preserved.
+// The result of loading an ORC file is also a JavaSchemaRDD.
+JavaSchemaRDD orcFile = sqlContext.orcFile("people.orc");
+
+// ORC files can also be registered as tables and then used in SQL statements.
+orcFile.registerTempTable("orcFile");
+JavaSchemaRDD teenagers = sqlContext.sql("SELECT name FROM orcFile WHERE age >= 13 AND age <= 19");
+List<String> teenagerNames = teenagers.map(new Function<Row, String>() {
+  public String call(Row row) {
+    return "Name: " + row.getString(0);
+  }
+}).collect();
+{% endhighlight %}
+
+</div>
+
+<div data-lang="python" markdown="1">
+
+{% highlight python %}
+# Use HiveContext to read or write ORC files.
+from pyspark.sql import HiveContext
+sqlContext = HiveContext(sc)
+
+schemaPeople # The SchemaRDD from the previous example.
+
+# SchemaRDDs can be saved as ORC files, maintaining the schema information.
+schemaPeople.saveAsOrcFile("people.orc")
+
+# Read in the ORC file created above. ORC files are self-describing so the schema is preserved.
+# The result of loading an ORC file is also a SchemaRDD.
+orcFile = sqlContext.orcFile("people.orc")
+
+# ORC files can also be registered as tables and then used in SQL statements.
+orcFile.registerTempTable("orcFile")
+teenagers = sqlContext.sql("SELECT name FROM orcFile WHERE age >= 13 AND age <= 19")
+teenNames = teenagers.map(lambda p: "Name: " + p.name)
+for teenName in teenNames.collect():
+  print teenName
+{% endhighlight %}
+
+</div>
+
+</div>
+ ## JSON Datasets
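Beyond `orcFile`/`saveAsOrcFile`, the ORC support in this series also plugs into the data sources API through `org.apache.spark.sql.hive.orc.DefaultSource`, mirroring the temporary-table registration used for Parquet in the test suites above. The following is a minimal sketch only, not part of the patch: it assumes the provider is resolved by its package name (`org.apache.spark.sql.hive.orc`), that a `HiveContext` named `hiveContext` is in scope, and that `path/to/people.orc` is a hypothetical directory previously written by `saveAsOrcFile`.

{% highlight scala %}
// Register an existing ORC directory as a temporary table via the data sources API
// (sketch only; the table name and path are illustrative).
hiveContext.sql(
  """CREATE TEMPORARY TABLE people_orc
    |USING org.apache.spark.sql.hive.orc
    |OPTIONS (
    |  path 'path/to/people.orc'
    |)""".stripMargin)

// Query the registered table like any other SchemaRDD-backed table.
hiveContext.sql("SELECT name FROM people_orc WHERE age >= 13 AND age <= 19")
  .collect()
  .foreach(println)
{% endhighlight %}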
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/NewParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/NewParquet.scala deleted file mode 100644 index a08326a698f2b..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/NewParquet.scala +++ /dev/null @@ -1,147 +0,0 @@ -package org.apache.spark.sql.parquet - -import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.mapreduce.Job -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat -import org.apache.spark.Logging -import org.apache.spark.rdd.RDD - -import org.apache.spark.sql.{SQLConf, Row, SQLContext} -import org.apache.spark.sql.catalyst.types.StructType -import org.apache.spark.sql.sources._ -import parquet.hadoop.ParquetInputFormat -import parquet.hadoop.util.ContextUtil - -class DefaultSource extends RelationProvider { - /** Returns a new base relation with the given parameters. */ - override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { - ParquetRelation2(parameters("path"))(sqlContext) - } -} - -case class Partition(partitioningKeys: Map[String, Any], files: Seq[String]) - -case class ParquetRelation2( - path: String)( - @transient val sqlContext: SQLContext) extends PrunedFilteredScan with Logging { - - def sparkContext = sqlContext.sparkContext - - val (partitionKeys: Seq[String], partitions: Seq[Partition]) = { - val fs = FileSystem.get(new java.net.URI(path), sparkContext.hadoopConfiguration) - val partValue = "([^=]+)=([^=]+)".r - - val folders = fs.listStatus(new Path(path)).filter(s => s.isDir).map(_.getPath.getName).map { - case partValue(key, value) => (key, value) - } - - if (folders.size > 0) { - val partitionKeys = folders.map(_._1).distinct - if (partitionKeys.size > 1) { - sys.error(s"Too many distinct partition keys: $partitionKeys") - } - - (partitionKeys.toSeq, folders.map { case (key, value) => - val files = s"$path/$key=$value" :: Nil - Partition(Map(key -> value.toInt), files) - }.toSeq) - - } else { - (Nil, Partition(Map.empty, path :: Nil) :: Nil) - } - } - - val schema = - StructType.fromAttributes( - ParquetTypesConverter.readSchemaFromFile( - new Path(partitions.head.files.head), - None, // TODO - sqlContext.isParquetBinaryAsString)) - - override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { - val job = new Job(sparkContext.hadoopConfiguration) - ParquetInputFormat.setReadSupportClass(job, classOf[RowReadSupport]) - val conf: Configuration = ContextUtil.getConfiguration(job) - - val requestedSchema = StructType(requiredColumns.map(schema(_))) - - val partitionFilters = filters.collect { - case e @ EqualTo(attr, value) if partitionKeys.contains(attr) => - logInfo(s"Parquet scan partition filter: $attr=$value") - (p: Partition) => p.partitioningKeys(attr) == value - - case e @ In(attr, values) if partitionKeys.contains(attr) => - logInfo(s"Parquet scan partition filter: $attr IN ${values.mkString("{", ",", "}")}") - val set = values.toSet - (p: Partition) => set.contains(p.partitioningKeys(attr)) - - case e @ GreaterThan(attr, value) if partitionKeys.contains(attr) => - logInfo(s"Parquet scan partition filter: $attr > $value") - (p: Partition) => p.partitioningKeys(attr).asInstanceOf[Int] > value.asInstanceOf[Int] - - case e @ GreaterThanOrEqual(attr, value) if partitionKeys.contains(attr) => - logInfo(s"Parquet scan partition filter: $attr >= $value") - (p: Partition) => 
p.partitioningKeys(attr).asInstanceOf[Int] >= value.asInstanceOf[Int] - - case e @ LessThan(attr, value) if partitionKeys.contains(attr) => - logInfo(s"Parquet scan partition filter: $attr < $value") - (p: Partition) => p.partitioningKeys(attr).asInstanceOf[Int] < value.asInstanceOf[Int] - - case e @ LessThanOrEqual(attr, value) if partitionKeys.contains(attr) => - logInfo(s"Parquet scan partition filter: $attr <= $value") - (p: Partition) => p.partitioningKeys(attr).asInstanceOf[Int] <= value.asInstanceOf[Int] - } - - val selectedPartitions = partitions.filter(p => partitionFilters.forall(_(p))) - - selectedPartitions.flatMap(_.files).foreach { curPath => - val qualifiedPath = { - val path = new Path(curPath) - path.getFileSystem(conf).makeQualified(path) - } - org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, qualifiedPath) - } - - def fractionRead = selectedPartitions.size.toDouble / partitions.size.toDouble * 100 - logInfo(s"Reading $fractionRead% of $path partitions") - - // Store both requested and original schema in `Configuration` - conf.set( - RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA, - ParquetTypesConverter.convertToString(requestedSchema.toAttributes)) - conf.set( - RowWriteSupport.SPARK_ROW_SCHEMA, - ParquetTypesConverter.convertToString(schema.toAttributes)) - - // Tell FilteringParquetRowInputFormat whether it's okay to cache Parquet and FS metadata - conf.set( - SQLConf.PARQUET_CACHE_METADATA, - sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA, "true")) - - val baseRDD = - new org.apache.spark.rdd.NewHadoopRDD( - sparkContext, - classOf[FilteringParquetRowInputFormat], - classOf[Void], - classOf[Row], - conf) - - baseRDD.map(_._2) - } - - override val sizeInBytes = { - def calculateTableSize(fs: FileSystem, path: Path): Long = { - val fileStatus = fs.getFileStatus(path) - val size = if (fileStatus.isDir) { - fs.listStatus(path).map(status => calculateTableSize(fs, status.getPath)).sum - } else { - fileStatus.getLen - } - - size - } - val fs = FileSystem.get(new java.net.URI(path), sparkContext.hadoopConfiguration) - calculateTableSize(fs, new Path(path)) - } -} \ No newline at end of file diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala index 719b37705154e..01b7104a669fc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala @@ -287,7 +287,8 @@ case class InsertIntoParquetTable( 1 } else { FileSystemHelper - .findMaxTaskId(NewFileOutputFormat.getOutputPath(job).toString, job.getConfiguration) + 1 + .findMaxTaskId( + NewFileOutputFormat.getOutputPath(job).toString, job.getConfiguration, "parquet") + 1 } def writeShard(context: TaskContext, iter: Iterator[Row]): Int = { @@ -624,7 +625,7 @@ private[parquet] object FilteringParquetRowInputFormat { .build[FileStatus, Array[BlockLocation]]() } -private[parquet] object FileSystemHelper { +private[sql] object FileSystemHelper { def listFiles(pathStr: String, conf: Configuration): Seq[Path] = { val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) @@ -640,19 +641,37 @@ private[parquet] object FileSystemHelper { fs.listStatus(path).map(_.getPath) } - /** - * Finds the maximum taskid in the output file names at the given path. 
- */ - def findMaxTaskId(pathStr: String, conf: Configuration): Int = { + /** + * List files with special extension + */ + def listFiles(origPath: Path, conf: Configuration, extension: String): Seq[Path] = { + val fs = origPath.getFileSystem(conf) + if (fs == null) { + throw new IllegalArgumentException( + s"Path $origPath is incorrectly formatted") + } + val path = origPath.makeQualified(fs) + if (fs.exists(path)) { + fs.listStatus(path).map(_.getPath).filter(p => p.getName.endsWith(extension)) + } else { + Seq.empty + } + } + + /** + * Finds the maximum taskid in the output file names at the given path. + */ + def findMaxTaskId(pathStr: String, conf: Configuration, extension: String): Int = { + // filename pattern is part-r-.$extension + require(Seq("orc", "parquet").contains(extension), s"Unsupported extension: $extension") + val nameP = new scala.util.matching.Regex(s"""part-r-(\\d{1,}).$extension""", "taskid") val files = FileSystemHelper.listFiles(pathStr, conf) - // filename pattern is part-r-.parquet - val nameP = new scala.util.matching.Regex("""part-r-(\d{1,}).parquet""", "taskid") val hiddenFileP = new scala.util.matching.Regex("_.*") files.map(_.getName).map { case nameP(taskid) => taskid.toInt case hiddenFileP() => 0 case other: String => - sys.error("ERROR: attempting to append to set of Parquet files and found file" + + sys.error(s"ERROR: attempting to append to set of $extension files and found file" + s"that does not match name pattern: $other") case _ => 0 }.reduceLeft((a, b) => if (a < b) b else a) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index a08326a698f2b..cd4f78ba57fb2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -1,147 +1,293 @@ -package org.apache.spark.sql.parquet - -import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.mapreduce.Job -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat -import org.apache.spark.Logging -import org.apache.spark.rdd.RDD - -import org.apache.spark.sql.{SQLConf, Row, SQLContext} -import org.apache.spark.sql.catalyst.types.StructType -import org.apache.spark.sql.sources._ -import parquet.hadoop.ParquetInputFormat -import parquet.hadoop.util.ContextUtil - -class DefaultSource extends RelationProvider { - /** Returns a new base relation with the given parameters. 
*/ - override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { - ParquetRelation2(parameters("path"))(sqlContext) - } -} - -case class Partition(partitioningKeys: Map[String, Any], files: Seq[String]) - -case class ParquetRelation2( - path: String)( - @transient val sqlContext: SQLContext) extends PrunedFilteredScan with Logging { - - def sparkContext = sqlContext.sparkContext - - val (partitionKeys: Seq[String], partitions: Seq[Partition]) = { - val fs = FileSystem.get(new java.net.URI(path), sparkContext.hadoopConfiguration) - val partValue = "([^=]+)=([^=]+)".r - - val folders = fs.listStatus(new Path(path)).filter(s => s.isDir).map(_.getPath.getName).map { - case partValue(key, value) => (key, value) - } - - if (folders.size > 0) { - val partitionKeys = folders.map(_._1).distinct - if (partitionKeys.size > 1) { - sys.error(s"Too many distinct partition keys: $partitionKeys") - } - - (partitionKeys.toSeq, folders.map { case (key, value) => - val files = s"$path/$key=$value" :: Nil - Partition(Map(key -> value.toInt), files) - }.toSeq) - - } else { - (Nil, Partition(Map.empty, path :: Nil) :: Nil) - } - } - - val schema = - StructType.fromAttributes( - ParquetTypesConverter.readSchemaFromFile( - new Path(partitions.head.files.head), - None, // TODO - sqlContext.isParquetBinaryAsString)) - - override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { - val job = new Job(sparkContext.hadoopConfiguration) - ParquetInputFormat.setReadSupportClass(job, classOf[RowReadSupport]) - val conf: Configuration = ContextUtil.getConfiguration(job) - - val requestedSchema = StructType(requiredColumns.map(schema(_))) - - val partitionFilters = filters.collect { - case e @ EqualTo(attr, value) if partitionKeys.contains(attr) => - logInfo(s"Parquet scan partition filter: $attr=$value") - (p: Partition) => p.partitioningKeys(attr) == value - - case e @ In(attr, values) if partitionKeys.contains(attr) => - logInfo(s"Parquet scan partition filter: $attr IN ${values.mkString("{", ",", "}")}") - val set = values.toSet - (p: Partition) => set.contains(p.partitioningKeys(attr)) - - case e @ GreaterThan(attr, value) if partitionKeys.contains(attr) => - logInfo(s"Parquet scan partition filter: $attr > $value") - (p: Partition) => p.partitioningKeys(attr).asInstanceOf[Int] > value.asInstanceOf[Int] - - case e @ GreaterThanOrEqual(attr, value) if partitionKeys.contains(attr) => - logInfo(s"Parquet scan partition filter: $attr >= $value") - (p: Partition) => p.partitioningKeys(attr).asInstanceOf[Int] >= value.asInstanceOf[Int] - - case e @ LessThan(attr, value) if partitionKeys.contains(attr) => - logInfo(s"Parquet scan partition filter: $attr < $value") - (p: Partition) => p.partitioningKeys(attr).asInstanceOf[Int] < value.asInstanceOf[Int] - - case e @ LessThanOrEqual(attr, value) if partitionKeys.contains(attr) => - logInfo(s"Parquet scan partition filter: $attr <= $value") - (p: Partition) => p.partitioningKeys(attr).asInstanceOf[Int] <= value.asInstanceOf[Int] - } - - val selectedPartitions = partitions.filter(p => partitionFilters.forall(_(p))) - - selectedPartitions.flatMap(_.files).foreach { curPath => - val qualifiedPath = { - val path = new Path(curPath) - path.getFileSystem(conf).makeQualified(path) - } - org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, qualifiedPath) - } - - def fractionRead = selectedPartitions.size.toDouble / partitions.size.toDouble * 100 - logInfo(s"Reading $fractionRead% of $path 
partitions") - - // Store both requested and original schema in `Configuration` - conf.set( - RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA, - ParquetTypesConverter.convertToString(requestedSchema.toAttributes)) - conf.set( - RowWriteSupport.SPARK_ROW_SCHEMA, - ParquetTypesConverter.convertToString(schema.toAttributes)) - - // Tell FilteringParquetRowInputFormat whether it's okay to cache Parquet and FS metadata - conf.set( - SQLConf.PARQUET_CACHE_METADATA, - sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA, "true")) - - val baseRDD = - new org.apache.spark.rdd.NewHadoopRDD( - sparkContext, - classOf[FilteringParquetRowInputFormat], - classOf[Void], - classOf[Row], - conf) - - baseRDD.map(_._2) - } - - override val sizeInBytes = { - def calculateTableSize(fs: FileSystem, path: Path): Long = { - val fileStatus = fs.getFileStatus(path) - val size = if (fileStatus.isDir) { - fs.listStatus(path).map(status => calculateTableSize(fs, status.getPath)).sum - } else { - fileStatus.getLen - } - - size - } - val fs = FileSystem.get(new java.net.URI(path), sparkContext.hadoopConfiguration) - calculateTableSize(fs, new Path(path)) - } -} \ No newline at end of file +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.parquet + +import java.util.{List => JList} + +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} +import org.apache.hadoop.conf.{Configurable, Configuration} +import org.apache.hadoop.io.Writable +import org.apache.hadoop.mapreduce.{JobContext, InputSplit, Job} + +import parquet.hadoop.ParquetInputFormat +import parquet.hadoop.util.ContextUtil + +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.{Partition => SparkPartition, Logging} +import org.apache.spark.rdd.{NewHadoopPartition, RDD} + +import org.apache.spark.sql.{SQLConf, Row, SQLContext} +import org.apache.spark.sql.catalyst.expressions.{SpecificMutableRow, And, Expression, Attribute} +import org.apache.spark.sql.catalyst.types.{IntegerType, StructField, StructType} +import org.apache.spark.sql.sources._ + +import scala.collection.JavaConversions._ + +/** + * Allows creation of parquet based tables using the syntax + * `CREATE TABLE ... USING org.apache.spark.sql.parquet`. Currently the only option required + * is `path`, which should be the location of a collection of, optionally partitioned, + * parquet files. + */ +class DefaultSource extends RelationProvider { + /** Returns a new base relation with the given parameters. 
*/ + override def createRelation( + sqlContext: SQLContext, + parameters: Map[String, String]): BaseRelation = { + val path = + parameters.getOrElse("path", sys.error("'path' must be specifed for parquet tables.")) + + ParquetRelation2(path)(sqlContext) + } +} + +private[parquet] case class Partition(partitionValues: Map[String, Any], files: Seq[FileStatus]) + +/** + * An alternative to [[ParquetRelation]] that plugs in using the data sources API. This class is + * currently not intended as a full replacement of the parquet support in Spark SQL though it is + * likely that it will eventually subsume the existing physical plan implementation. + * + * Compared with the current implementation, this class has the following notable differences: + * + * Partitioning: Partitions are auto discovered and must be in the form of directories `key=value/` + * located at `path`. Currently only a single partitioning column is supported and it must + * be an integer. This class supports both fully self-describing data, which contains the partition + * key, and data where the partition key is only present in the folder structure. The presence + * of the partitioning key in the data is also auto-detected. The `null` partition is not yet + * supported. + * + * Metadata: The metadata is automatically discovered by reading the first parquet file present. + * There is currently no support for working with files that have different schema. Additionally, + * when parquet metadata caching is turned on, the FileStatus objects for all data will be cached + * to improve the speed of interactive querying. When data is added to a table it must be dropped + * and recreated to pick up any changes. + * + * Statistics: Statistics for the size of the table are automatically populated during metadata + * discovery. + */ +@DeveloperApi +case class ParquetRelation2(path: String)(@transient val sqlContext: SQLContext) + extends CatalystScan with Logging { + + def sparkContext = sqlContext.sparkContext + + // Minor Hack: scala doesnt seem to respect @transient for vals declared via extraction + @transient + private var partitionKeys: Seq[String] = _ + @transient + private var partitions: Seq[Partition] = _ + discoverPartitions() + + // TODO: Only finds the first partition, assumes the key is of type Integer... + private def discoverPartitions() = { + val fs = FileSystem.get(new java.net.URI(path), sparkContext.hadoopConfiguration) + val partValue = "([^=]+)=([^=]+)".r + + val childrenOfPath = fs.listStatus(new Path(path)).filterNot(_.getPath.getName.startsWith("_")) + val childDirs = childrenOfPath.filter(s => s.isDir) + + if (childDirs.size > 0) { + val partitionPairs = childDirs.map(_.getPath.getName).map { + case partValue(key, value) => (key, value) + } + + val foundKeys = partitionPairs.map(_._1).distinct + if (foundKeys.size > 1) { + sys.error(s"Too many distinct partition keys: $foundKeys") + } + + // Do a parallel lookup of partition metadata. + val partitionFiles = + childDirs.par.map { d => + fs.listStatus(d.getPath) + // TODO: Is there a standard hadoop function for this? 
+ .filterNot(_.getPath.getName.startsWith("_")) + .filterNot(_.getPath.getName.startsWith(".")) + }.seq + + partitionKeys = foundKeys.toSeq + partitions = partitionFiles.zip(partitionPairs).map { case (files, (key, value)) => + Partition(Map(key -> value.toInt), files) + }.toSeq + } else { + partitionKeys = Nil + partitions = Partition(Map.empty, childrenOfPath) :: Nil + } + } + + override val sizeInBytes = partitions.flatMap(_.files).map(_.getLen).sum + + val dataSchema = StructType.fromAttributes( // TODO: Parquet code should not deal with attributes. + ParquetTypesConverter.readSchemaFromFile( + partitions.head.files.head.getPath, + Some(sparkContext.hadoopConfiguration), + sqlContext.isParquetBinaryAsString)) + + val dataIncludesKey = + partitionKeys.headOption.map(dataSchema.fieldNames.contains(_)).getOrElse(true) + + override val schema = + if (dataIncludesKey) { + dataSchema + } else { + StructType(dataSchema.fields :+ StructField(partitionKeys.head, IntegerType)) + } + + override def buildScan(output: Seq[Attribute], predicates: Seq[Expression]): RDD[Row] = { + // This is mostly a hack so that we can use the existing parquet filter code. + val requiredColumns = output.map(_.name) + // TODO: Parquet filters should be based on data sources API, not catalyst expressions. + val filters = DataSourceStrategy.selectFilters(predicates) + + val job = new Job(sparkContext.hadoopConfiguration) + ParquetInputFormat.setReadSupportClass(job, classOf[RowReadSupport]) + val jobConf: Configuration = ContextUtil.getConfiguration(job) + + val requestedSchema = StructType(requiredColumns.map(schema(_))) + + // TODO: Make folder based partitioning a first class citizen of the Data Sources API. + val partitionFilters = filters.collect { + case e @ EqualTo(attr, value) if partitionKeys.contains(attr) => + logInfo(s"Parquet scan partition filter: $attr=$value") + (p: Partition) => p.partitionValues(attr) == value + + case e @ In(attr, values) if partitionKeys.contains(attr) => + logInfo(s"Parquet scan partition filter: $attr IN ${values.mkString("{", ",", "}")}") + val set = values.toSet + (p: Partition) => set.contains(p.partitionValues(attr)) + + case e @ GreaterThan(attr, value) if partitionKeys.contains(attr) => + logInfo(s"Parquet scan partition filter: $attr > $value") + (p: Partition) => p.partitionValues(attr).asInstanceOf[Int] > value.asInstanceOf[Int] + + case e @ GreaterThanOrEqual(attr, value) if partitionKeys.contains(attr) => + logInfo(s"Parquet scan partition filter: $attr >= $value") + (p: Partition) => p.partitionValues(attr).asInstanceOf[Int] >= value.asInstanceOf[Int] + + case e @ LessThan(attr, value) if partitionKeys.contains(attr) => + logInfo(s"Parquet scan partition filter: $attr < $value") + (p: Partition) => p.partitionValues(attr).asInstanceOf[Int] < value.asInstanceOf[Int] + + case e @ LessThanOrEqual(attr, value) if partitionKeys.contains(attr) => + logInfo(s"Parquet scan partition filter: $attr <= $value") + (p: Partition) => p.partitionValues(attr).asInstanceOf[Int] <= value.asInstanceOf[Int] + } + + val selectedPartitions = partitions.filter(p => partitionFilters.forall(_(p))) + + selectedPartitions.flatMap(_.files).foreach { status => + val path = status.getPath + val qualifiedPath = path.getFileSystem(jobConf).makeQualified(path) + org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, qualifiedPath) + } + + // Push down filters when possible + predicates + .reduceOption(And) + .flatMap(ParquetFilters.createFilter) + .filter(_ => 
sqlContext.parquetFilterPushDown) + .foreach(ParquetInputFormat.setFilterPredicate(jobConf, _)) + + def percentRead = selectedPartitions.size.toDouble / partitions.size.toDouble * 100 + logInfo(s"Reading $percentRead% of $path partitions") + + // Store both requested and original schema in `Configuration` + jobConf.set( + RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA, + ParquetTypesConverter.convertToString(requestedSchema.toAttributes)) + jobConf.set( + RowWriteSupport.SPARK_ROW_SCHEMA, + ParquetTypesConverter.convertToString(schema.toAttributes)) + + // Tell FilteringParquetRowInputFormat whether it's okay to cache Parquet and FS metadata + val useCache = sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA, "true").toBoolean + jobConf.set(SQLConf.PARQUET_CACHE_METADATA, useCache.toString) + + val baseRDD = + new org.apache.spark.rdd.NewHadoopRDD( + sparkContext, + classOf[FilteringParquetRowInputFormat], + classOf[Void], + classOf[Row], + jobConf) { + val cacheMetadata = useCache + + @transient + val cachedStatus = selectedPartitions.flatMap(_.files) + + // Overridden so we can inject our own cached files statuses. + override def getPartitions: Array[SparkPartition] = { + val inputFormat = + if (cacheMetadata) { + new FilteringParquetRowInputFormat { + override def listStatus(jobContext: JobContext): JList[FileStatus] = cachedStatus + } + } else { + new FilteringParquetRowInputFormat + } + + inputFormat match { + case configurable: Configurable => + configurable.setConf(getConf) + case _ => + } + val jobContext = newJobContext(getConf, jobId) + val rawSplits = inputFormat.getSplits(jobContext).toArray + val result = new Array[SparkPartition](rawSplits.size) + for (i <- 0 until rawSplits.size) { + result(i) = + new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable]) + } + result + } + } + + // The ordinal for the partition key in the result row, if requested. + val partitionKeyLocation = + partitionKeys + .headOption + .map(requiredColumns.indexOf(_)) + .getOrElse(-1) + + // When the data does not include the key and the key is requested then we must fill it in + // based on information from the input split. + if (!dataIncludesKey && partitionKeyLocation != -1) { + baseRDD.mapPartitionsWithInputSplit { case (split, iter) => + val partValue = "([^=]+)=([^=]+)".r + val partValues = + split.asInstanceOf[parquet.hadoop.ParquetInputSplit] + .getPath + .toString + .split("/") + .flatMap { + case partValue(key, value) => Some(key -> value) + case _ => None + }.toMap + + val currentValue = partValues.values.head.toInt + iter.map { pair => + val res = pair._2.asInstanceOf[SpecificMutableRow] + res.setInt(partitionKeyLocation, currentValue) + res + } + } + } else { + baseRDD.map(_._2) + } + } +} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/orc.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/orc.scala new file mode 100644 index 0000000000000..19e83abbf02bc --- /dev/null +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/orc.scala @@ -0,0 +1,245 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.orc + +import java.util.{Locale, Properties} +import scala.collection.JavaConversions._ + +import org.apache.hadoop.mapreduce.Job +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat +import org.apache.hadoop.hive.serde2.ColumnProjectionUtils +import org.apache.hadoop.io.{NullWritable, Writable} +import org.apache.hadoop.mapred.JobConf +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{Path, FileStatus, FileSystem} +import org.apache.hadoop.hive.ql.io.orc.Reader +import org.apache.hadoop.hive.ql.io.orc.{OrcFile, OrcInputFormat, OrcSerde} +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector + +import org.apache.spark.sql.sources.{CatalystScan, BaseRelation, RelationProvider} +import org.apache.spark.sql.SQLContext +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.Logging +import org.apache.spark.sql.catalyst.types._ +import org.apache.spark.sql.catalyst.expressions.{SpecificMutableRow, Attribute, Expression, Row} +import org.apache.spark.sql.hive.{HadoopTableReader, HiveMetastoreTypes} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.parquet.FileSystemHelper + +/** + * Allows creation of orc based tables using the syntax + * `CREATE TEMPORARY TABLE ... USING org.apache.spark.sql.orc`. + * Currently the only option required is `path`, which should be the location of a collection of, + * optionally partitioned, parquet files. + */ +class DefaultSource extends RelationProvider { + /** Returns a new base relation with the given parameters. */ + override def createRelation( + sqlContext: SQLContext, + parameters: Map[String, String]): BaseRelation = { + val path = + parameters.getOrElse("path", sys.error("'path' must be specified for orc tables.")) + + OrcRelation2(path)(sqlContext) + } +} +private[orc] case class Partition(partitionValues: Map[String, Any], files: Seq[FileStatus]) + +/** + * Partitioning: Partitions are auto discovered and must be in the form of directories `key=value/` + * located at `path`. Currently only a single partitioning column is supported and it must + * be an integer. This class supports both fully self-describing data, which contains the partition + * key, and data where the partition key is only present in the folder structure. The presence + * of the partitioning key in the data is also auto-detected. The `null` partition is not yet + * supported. + * + * Metadata: The metadata is automatically discovered by reading the first orc file present. + * There is currently no support for working with files that have different schema. Additionally, + * when parquet metadata caching is turned on, the FileStatus objects for all data will be cached + * to improve the speed of interactive querying. When data is added to a table it must be dropped + * and recreated to pick up any changes. + * + * Statistics: Statistics for the size of the table are automatically populated during metadata + * discovery. 
+ */ +@DeveloperApi +case class OrcRelation2(path: String)(@transient val sqlContext: SQLContext) + extends CatalystScan with Logging { + + // used for what? + val prop: Properties = new Properties + + @transient + lazy val serde: OrcSerde = initSerde + + private def initSerde(): OrcSerde = { + val serde = new OrcSerde + serde.initialize(null, prop) + serde + } + + def sparkContext = sqlContext.sparkContext + + // Minor Hack: scala doesnt seem to respect @transient for vals declared via extraction + @transient + private var partitionKeys: Seq[String] = _ + @transient + private var partitions: Seq[Partition] = _ + discoverPartitions() + + // TODO: Only finds the first partition, assumes the key is of type Integer... + // Why key type is Int? + private def discoverPartitions() = { + val fs = FileSystem.get(new java.net.URI(path), sparkContext.hadoopConfiguration) + val partValue = "([^=]+)=([^=]+)".r + + val childrenOfPath = fs.listStatus(new Path(path)) + .filterNot(_.getPath.getName.startsWith("_")) + .filterNot(_.getPath.getName.startsWith(".")) + + val childDirs = childrenOfPath.filter(s => s.isDir) + + if (childDirs.size > 0) { + val partitionPairs = childDirs.map(_.getPath.getName).map { + case partValue(key, value) => (key, value) + } + + val foundKeys = partitionPairs.map(_._1).distinct + if (foundKeys.size > 1) { + sys.error(s"Too many distinct partition keys: $foundKeys") + } + + // Do a parallel lookup of partition metadata. + val partitionFiles = + childDirs.par.map { d => + fs.listStatus(d.getPath) + // TODO: Is there a standard hadoop function for this? + .filterNot(_.getPath.getName.startsWith("_")) + .filterNot(_.getPath.getName.startsWith(".")) + }.seq + + partitionKeys = foundKeys.toSeq + partitions = partitionFiles.zip(partitionPairs).map { case (files, (key, value)) => + Partition(Map(key -> value.toInt), files) + }.toSeq + } else { + partitionKeys = Nil + partitions = Partition(Map.empty, childrenOfPath) :: Nil + } + } + + override val sizeInBytes = partitions.flatMap(_.files).map(_.getLen).sum + + private def getMetaDataReader(origPath: Path, configuration: Option[Configuration]): Reader = { + val conf = configuration.getOrElse(new Configuration()) + val fs: FileSystem = origPath.getFileSystem(conf) + val orcFiles = FileSystemHelper.listFiles(origPath, conf, ".orc") + if (orcFiles == Seq.empty) { + // should return null when write to orc file + return null + } + OrcFile.createReader(fs, orcFiles(0)) + } + + private def orcSchema( + path: Path, + conf: Option[Configuration], + prop: Properties): StructType = { + // get the schema info through ORC Reader + val reader = getMetaDataReader(path, conf) + if (reader == null) { + // return empty seq when saveAsOrcFile + return StructType(Seq.empty) + } + val inspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] + // data types that is inspected by this inspector + val schema = inspector.getTypeName + // set prop here, initial OrcSerde need it + val fields = inspector.getAllStructFieldRefs + val (columns, columnTypes) = fields.map { f => + f.getFieldName -> f.getFieldObjectInspector.getTypeName + }.unzip + prop.setProperty("columns", columns.mkString(",")) + prop.setProperty("columns.types", columnTypes.mkString(":")) + + HiveMetastoreTypes.toDataType(schema).asInstanceOf[StructType] + } + + val dataSchema = orcSchema( + partitions.head.files.head.getPath, + Some(sparkContext.hadoopConfiguration), + prop) + + val dataIncludesKey = + 
partitionKeys.headOption.map(dataSchema.fieldNames.contains(_)).getOrElse(true) + + override val schema = + if (dataIncludesKey) { + dataSchema + } else { + StructType(dataSchema.fields :+ StructField(partitionKeys.head, IntegerType)) + } + + override def buildScan(output: Seq[Attribute], predicates: Seq[Expression]): RDD[Row] = { + val sc = sparkContext + val job = new Job(sc.hadoopConfiguration) + val conf: Configuration = job.getConfiguration + // todo: predicates push down + path.split(",").foreach { curPath => + val qualifiedPath = { + val path = new Path(curPath) + path.getFileSystem(conf).makeQualified(path) + } + FileInputFormat.addInputPath(job, qualifiedPath) + } + + addColumnIds(output, schema.toAttributes, conf) + val inputClass = classOf[OrcInputFormat].asInstanceOf[ + Class[_ <: org.apache.hadoop.mapred.InputFormat[NullWritable, Writable]]] + + // use SpecificMutableRow to decrease GC garbage + val mutableRow = new SpecificMutableRow(output.map(_.dataType)) + val attrsWithIndex = output.zipWithIndex + val rowRdd = sc.hadoopRDD[NullWritable, Writable](conf.asInstanceOf[JobConf], inputClass, + classOf[NullWritable], classOf[Writable]).map(_._2).mapPartitions { iter => + val deserializer = serde + HadoopTableReader.fillObject(iter, deserializer, attrsWithIndex, mutableRow) + } + rowRdd + } + + /** + * add column ids and names + * @param output + * @param relationOutput + * @param conf + */ + def addColumnIds(output: Seq[Attribute], relationOutput: Seq[Attribute], conf: Configuration) { + val names = output.map(_.name) + val fieldIdMap = relationOutput.map(_.name).zipWithIndex.toMap + val ids = output.map { att => + val realName = att.name.toLowerCase(Locale.ENGLISH) + fieldIdMap.getOrElse(realName, -1) + }.filter(_ >= 0).map(_.asInstanceOf[Integer]) + + assert(ids.size == names.size, "columns id and name length does not match!") + if (ids != null && !ids.isEmpty) { + ColumnProjectionUtils.appendReadColumns(conf, ids, names) + } + } +} diff --git a/sql/hive/src/test/resources/data/files/orcfiles/.part-r-1.orc.crc b/sql/hive/src/test/resources/data/files/orcfiles/.part-r-1.orc.crc new file mode 100644 index 0000000000000000000000000000000000000000..048f7346c2c05e08ebe5a11aed78665f40e3e378 GIT binary patch literal 20 bcmYc;N@ieSU}BgvvBaS1*w4n6U^Z_6Ku`wu literal 0 HcmV?d00001 diff --git a/sql/hive/src/test/resources/data/files/orcfiles/.part-r-2.orc.crc b/sql/hive/src/test/resources/data/files/orcfiles/.part-r-2.orc.crc new file mode 100644 index 0000000000000000000000000000000000000000..52810863f2ae9a45c8f00f8d022cbe68369ea23b GIT binary patch literal 20 bcmYc;N@ieSU}BK>5DB}x^|_y9#fBvSHcbY& literal 0 HcmV?d00001 diff --git a/sql/hive/src/test/resources/data/files/orcfiles/_SUCCESS b/sql/hive/src/test/resources/data/files/orcfiles/_SUCCESS new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/data/files/orcfiles/part-r-1.orc b/sql/hive/src/test/resources/data/files/orcfiles/part-r-1.orc new file mode 100644 index 0000000000000000000000000000000000000000..3188e14c1fa482780268cdff5acb8ce8c2e5609e GIT binary patch literal 1296 zcmeYda^`1X;9?bG=4fElVPJSH^fW1zfl*PekIhZBOwWy3QO(<%K^Le*go~MhfkB8D zs7{E3gF)aQqeMeJP6;cZQ9+NBk`e@h9%Lj;ne;5BfyvRJQsJJX(4iae85ju3<1`qj zgeuUD0$eOWH!=X-*TAU7%*E)Sz{nume~^*ck%K8ZD!}1)zE0ddr871gqR#2aF)X?u z(37yChUH{Zu*u;c0<&KjT#8`B< zTypCuvwET8BYyXbz{2TvzdDVM7$_f+OzkNY@-a9m;<;>Q;xv)sGn*Y|xlW5vJIu9u z&BWBXyk;ji9yPOmax=BRmP@AKgrIAm<-;jL>2r!5XL_&OvFs4<_B(-3=dsJJ{CLDz 
z|4QZ4-g2p$4=2Ta_x*fm{*8SGBZKcrb3a)I0oRM+yOt~{aBS3R5pd%0@%uM_&c>6H z+l;(tt5$M~{1@>le)dxB{`>z2jAcGHC(Atd_bZsR-{NTIxq@lMbDlGa|D5d|^NcHe z=dsefg2~Z$p7H%JIPma9Qdw@=`Lu7j&km&VE9&`4xvWN2VuWMXDvWn<^yMC+6cQE@6%&_`l#-T_m6KOc zR8m$^Ra4i{)Y8_`)zddHG%_|ZH8Z!cw6eCbwX=6{baHlab#wRd^z!!c_45x13;PSFZ;q=bd{Sgw(_n6Y?AlZfRPhkNIjyLx7b1kY`fRa|D8Th?Z~^o^zH z?WwGP{`1ByXn)uw|7gRm&yJhLl}f4=%%(_hd#!)uS6fg`!OTr16YVP)86GEy-8igx z^z=cUg7&~i_N$%dxG^sd;$!a@U{@A4=GK%<122qB^44iTb3y+*TnxSwcB{TCV_Y=;-0t1de$x|{`xIfoo*co(b z4o`prhZ_G8YY!)h-?AM6jrZC36vDe_c3(W0*d))uv7b=@ScWwSC^0ecG<{)a_78Fv F0RZVK>0kf= literal 0 HcmV?d00001 diff --git a/sql/hive/src/test/resources/data/files/orcfiles/part-r-2.orc b/sql/hive/src/test/resources/data/files/orcfiles/part-r-2.orc new file mode 100644 index 0000000000000000000000000000000000000000..525488226e2a975260e903d3b70e7b39c182cc0c GIT binary patch literal 1286 zcma)+doP*9Az3SGt3A%l#59vj zk!hJBx5%}Kq}j+~jKL7yT(UL{<4$Zba+w_`yMK1}ug`hU^PKm2&ilvfO?A-*0Ih*X zAX(5?0LW5@f`S2L%DA!3ow#ow-GzWLr4xL80a@kJMIb~HF@HThRHgeb2E%d7~b?mMC@jZH1_7~5TS zidGX-w~WJ8t!CVS#p2vtSAl0~aUsWlY9cq!E%V5ei9&ARM?MTU^A8pB>6{btKKMs* zhF47=oqK9YI5gBOpp*;coRf39d5)%6^-A`-S-4jwNZ5`Cm2%seeh2aRLUtBoCU~B7 z_ZXEzxPLyXbuHeqC?HBosEBDly6AEy*qqYe?o+;aEN^7qqKxXd8wEfE69F)rV5zD7 z%WDwWE{Z#z4FS+?HSqoZ^yAWUhuWH&;M4srWf$k|Y>}}?acrXq_Dy%ApOPwDjDC)d zHxq;>loSdgI0Cw0);7B1o30oPsk$|YLVDB@WpS^%J?!kg549J12HBeDP&b*0~u~jHeI5Sg;ez!?@L}7+2Yg|A?OSTM z$lW4yE9lc)=fU{z?g$95SXW}>*jG8+Yw-z*N!M>Ar`$~C-TEypoiE79yq%Sulbd%Z zzu<0RQE^FW*}ePa6%Q(_9#%i9c`W?>Np0QJ`ac>Pn?%jx7D;PcyR@V8Sy%V-7cXDE z?&3ps;9qZthDT(hWAgEd$*Jj?*}3_53yXih|FE>ISoyfRw!Wd%0APD*Jpf)E z6fDacsowoUJEc4+zRh*;vY(X!&UDQ)v+ zqoa#P)6K>ToQVlGi;3_Q&zzFmWPfO7#jYg&GMtUq(L8{&L69-Hhic@VI2(vez}Xmn zZS7Bv1}KI}Ub9#n9!fK94yT2o^>HI3p)g({Aw7SWgYtER_L>ko+rTyyHWvQqR3G;R ze_{9uVu!i4)))d68MPLo3*S;`pEuA AllDataTypes( +// s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0)) +// +// data.saveAsOrcFile(tempDir) +// checkAnswer( +// TestHive.orcFile(tempDir), +// data.toSchemaRDD.collect().toSeq) +// +// Utils.deleteRecursively(new File(tempDir)) +// } +// +// test("read/write binary data") { +// val tempDir = getTempFilePath("orcTest").getCanonicalPath +// val range = (0 to 3) +// sparkContext.parallelize(range) +// .map(x => BinaryData(s"test$x".getBytes("utf8"))).saveAsOrcFile(tempDir) +// +// TestHive.orcFile(tempDir) +// .map(r => new String(r(0).asInstanceOf[Array[Byte]], "utf8")) +// .collect().toSet == Set("test0", "test1", "test2", "test3") +// Utils.deleteRecursively(new File(tempDir)) +// } +// +// test("read/right empty RDD") { +// val tempDir = getTempFilePath("orcTest").getCanonicalPath +// val range = (0 to 255) +// val data = sparkContext +// .parallelize(range) +// .map(x => AllDataTypes( +// s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0)) +// .filter(_.intField > 500) +// data.saveAsOrcFile(tempDir) +// checkAnswer( +// TestHive.orcFile(tempDir), +// data.toSchemaRDD.collect().toSeq) +// +// Utils.deleteRecursively(new File(tempDir)) +// } +// +// test("Read/Write All Types with non-primitive type") { +// val tempDir = getTempFilePath("orcTest").getCanonicalPath +// val range = (0 to 255) +// val data = sparkContext.parallelize(range) +// .map(x => AllDataTypesWithNonPrimitiveType( +// s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0, +// (0 until x), +// (0 until x).map(Option(_).filter(_ % 3 == 0)), +// (0 until x).map(i => i -> i.toLong).toMap, +// (0 until x).map(i => i -> Option(i.toLong)).toMap + (x -> None), +// Data((0 until x), 
Nested(x, s"$x")))) +// data.saveAsOrcFile(tempDir) +// +// checkAnswer( +// TestHive.orcFile(tempDir), +// data.toSchemaRDD.collect().toSeq) +// Utils.deleteRecursively(new File(tempDir)) +// } +// +// test("Creating case class RDD table") { +// sparkContext.parallelize((1 to 100)) +// .map(i => TestRDDEntry(i, s"val_$i")) +// .registerTempTable("tmp") +// val rdd = sql("SELECT * FROM tmp").collect().sortBy(_.getInt(0)) +// var counter = 1 +// rdd.foreach { +// // '===' does not like string comparison? +// row => { +// assert(row.getString(1).equals(s"val_$counter"), s"row $counter value ${row.getString(1)} does not match val_$counter") +// counter = counter + 1 +// } +// } +// } +// +// test("Simple selection form orc table") { +// val tempDir = getTempFilePath("orcTest").getCanonicalPath +// val data = sparkContext.parallelize((1 to 10)) +// .map(i => Person(s"name_$i", i, (0 until 2).map{ m=> +// Contact(s"contact_$m", s"phone_$m") })) +// data.saveAsOrcFile(tempDir) +// val f = TestHive.orcFile(tempDir) +// f.registerTempTable("tmp") +// var rdd = sql("SELECT name FROM tmp where age <= 5") +// assert(rdd.count() == 5) +// +// rdd = sql("SELECT name, contacts FROM tmp where age > 5") +// assert(rdd.count() == 5) +// +// val contacts = rdd.flatMap(t=>t(1).asInstanceOf[Seq[_]]) +// assert(contacts.count() == 10) +// Utils.deleteRecursively(new File(tempDir)) +// } +// +// test("save and load case class RDD with Nones as orc") { +// val data = OptionalReflectData(None, None, None, None, None) +// val rdd = sparkContext.parallelize(data :: data :: data :: Nil) +// val tempDir = getTempFilePath("orcTest").getCanonicalPath +// rdd.saveAsOrcFile(tempDir) +// val readFile = TestHive.orcFile(tempDir) +// val rdd_saved = readFile.collect() +// assert(rdd_saved(0) === Seq.fill(5)(null)) +// Utils.deleteRecursively(new File(tempDir)) +// } +// +// test("Compression options for writing to a Orcfile") { +// val tempDir = getTempFilePath("orcTest").getCanonicalPath +// val rdd = TestHive.sparkContext.parallelize((1 to 100)) +// .map(i => TestRDDEntry(i, s"val_$i")) +// +// // test default compression codec, now only support zlib +// rdd.saveAsOrcFile(tempDir) +// val actualCodec = OrcFileOperator.getMetaDataReader(new Path(tempDir), Some(new Configuration())).getCompression.name +// assert(actualCodec == "ZLIB") +// +// Utils.deleteRecursively(new File(tempDir)) +// } +// +// test("Get ORC Schema with ORC Reader") { +// val path = "src/test/resources/data/files/orcfiles" +// val attributes = OrcFileOperator.orcSchema(path, Some(TestHive.sparkContext.hadoopConfiguration), new Properties()) +// assert(attributes(0).dataType == StringType) +// assert(attributes(1).dataType == IntegerType) +// assert(attributes(2).dataType == LongType) +// assert(attributes(3).dataType == FloatType) +// assert(attributes(4).dataType == DoubleType) +// assert(attributes(5).dataType == ShortType) +// assert(attributes(6).dataType == ByteType) +// assert(attributes(7).dataType == BooleanType) +// } +// +// ignore("Other Compression options for writing to an Orcfile only supported in hive 0.13.1 and above") { +// TestHive.sparkContext.hadoopConfiguration.set(orcDefaultCompressVar, "SNAPPY") +// var tempDir = getTempFilePath("orcTest").getCanonicalPath +// val rdd = sparkContext.parallelize((1 to 100)) +// .map(i => TestRDDEntry(i, s"val_$i")) +// rdd.saveAsOrcFile(tempDir) +// var actualCodec = OrcFileOperator.getMetaDataReader(new Path(tempDir), Some(new Configuration())).getCompression +// assert(actualCodec == 
CompressionKind.SNAPPY) +// Utils.deleteRecursively(new File(tempDir)) +// +// TestHive.sparkContext.hadoopConfiguration.set(orcDefaultCompressVar, "NONE") +// tempDir = getTempFilePath("orcTest").getCanonicalPath +// rdd.saveAsOrcFile(tempDir) +// actualCodec = OrcFileOperator.getMetaDataReader(new Path(tempDir), Some(new Configuration())).getCompression +// assert(actualCodec == CompressionKind.NONE) +// Utils.deleteRecursively(new File(tempDir)) +// +// TestHive.sparkContext.hadoopConfiguration.set(orcDefaultCompressVar, "LZO") +// tempDir = getTempFilePath("orcTest").getCanonicalPath +// rdd.saveAsOrcFile(tempDir) +// actualCodec = OrcFileOperator.getMetaDataReader(new Path(tempDir), Some(new Configuration())).getCompression +// assert(actualCodec == CompressionKind.LZO) +// Utils.deleteRecursively(new File(tempDir)) +// } +} From c90ed2f6c9996dd665aca21b072b68cd3a638dc1 Mon Sep 17 00:00:00 2001 From: w00228970 Date: Thu, 20 Nov 2014 14:09:24 +0800 Subject: [PATCH 29/32] test case --- .../spark/sql/hive/orc/ORCQuerySuite.scala | 488 +++++++++--------- .../spark/sql/hive/orc/OrcSourceSuite.scala | 144 ++++++ 2 files changed, 388 insertions(+), 244 deletions(-) create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala index 320854addf07b..684252c1960a8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/ORCQuerySuite.scala @@ -1,244 +1,244 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.hive.orc - -import java.util.Properties -import java.io.File -import org.scalatest.BeforeAndAfterAll -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.Path -import org.apache.spark.sql.QueryTest -import org.apache.spark.sql.hive.test.TestHive -import org.apache.spark.sql.catalyst.util.getTempFilePath -import org.apache.spark.sql.hive.test.TestHive._ -import org.apache.spark.sql.catalyst.types._ -import org.apache.spark.util.Utils -import org.apache.hadoop.hive.ql.io.orc.CompressionKind - -case class TestRDDEntry(key: Int, value: String) - -case class NullReflectData( - intField: java.lang.Integer, - longField: java.lang.Long, - floatField: java.lang.Float, - doubleField: java.lang.Double, - booleanField: java.lang.Boolean) - -case class OptionalReflectData( - intField: Option[Int], - longField: Option[Long], - floatField: Option[Float], - doubleField: Option[Double], - booleanField: Option[Boolean]) - -case class Nested(i: Int, s: String) - -case class Data(array: Seq[Int], nested: Nested) - -case class Contact(name: String, phone: String) - -case class Person(name: String, age: Int, contacts: Seq[Contact]) - -case class AllDataTypes( - stringField: String, - intField: Int, - longField: Long, - floatField: Float, - doubleField: Double, - shortField: Short, - byteField: Byte, - booleanField: Boolean) - -case class AllDataTypesWithNonPrimitiveType( - stringField: String, - intField: Int, - longField: Long, - floatField: Float, - doubleField: Double, - shortField: Short, - byteField: Byte, - booleanField: Boolean, - array: Seq[Int], - arrayContainsNull: Seq[Option[Int]], - map: Map[Int, Long], - mapValueContainsNull: Map[Int, Option[Long]], - data: Data) - -case class BinaryData(binaryData: Array[Byte]) - -class OrcQuerySuite extends QueryTest with BeforeAndAfterAll { - -// test("Read/Write All Types") { -// val tempDir = getTempFilePath("orcTest").getCanonicalPath -// val range = (0 to 255) -// val data = sparkContext -// .parallelize(range) -// .map(x => AllDataTypes( -// s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0)) -// -// data.saveAsOrcFile(tempDir) -// checkAnswer( -// TestHive.orcFile(tempDir), -// data.toSchemaRDD.collect().toSeq) -// -// Utils.deleteRecursively(new File(tempDir)) -// } -// -// test("read/write binary data") { -// val tempDir = getTempFilePath("orcTest").getCanonicalPath -// val range = (0 to 3) -// sparkContext.parallelize(range) -// .map(x => BinaryData(s"test$x".getBytes("utf8"))).saveAsOrcFile(tempDir) -// -// TestHive.orcFile(tempDir) -// .map(r => new String(r(0).asInstanceOf[Array[Byte]], "utf8")) -// .collect().toSet == Set("test0", "test1", "test2", "test3") -// Utils.deleteRecursively(new File(tempDir)) -// } -// -// test("read/right empty RDD") { -// val tempDir = getTempFilePath("orcTest").getCanonicalPath -// val range = (0 to 255) -// val data = sparkContext -// .parallelize(range) -// .map(x => AllDataTypes( -// s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0)) -// .filter(_.intField > 500) -// data.saveAsOrcFile(tempDir) -// checkAnswer( -// TestHive.orcFile(tempDir), -// data.toSchemaRDD.collect().toSeq) -// -// Utils.deleteRecursively(new File(tempDir)) -// } -// -// test("Read/Write All Types with non-primitive type") { -// val tempDir = getTempFilePath("orcTest").getCanonicalPath -// val range = (0 to 255) -// val data = sparkContext.parallelize(range) -// .map(x => AllDataTypesWithNonPrimitiveType( -// s"$x", x, x.toLong, 
x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0, -// (0 until x), -// (0 until x).map(Option(_).filter(_ % 3 == 0)), -// (0 until x).map(i => i -> i.toLong).toMap, -// (0 until x).map(i => i -> Option(i.toLong)).toMap + (x -> None), -// Data((0 until x), Nested(x, s"$x")))) -// data.saveAsOrcFile(tempDir) -// -// checkAnswer( -// TestHive.orcFile(tempDir), -// data.toSchemaRDD.collect().toSeq) -// Utils.deleteRecursively(new File(tempDir)) -// } -// -// test("Creating case class RDD table") { -// sparkContext.parallelize((1 to 100)) -// .map(i => TestRDDEntry(i, s"val_$i")) -// .registerTempTable("tmp") -// val rdd = sql("SELECT * FROM tmp").collect().sortBy(_.getInt(0)) -// var counter = 1 -// rdd.foreach { -// // '===' does not like string comparison? -// row => { -// assert(row.getString(1).equals(s"val_$counter"), s"row $counter value ${row.getString(1)} does not match val_$counter") -// counter = counter + 1 -// } -// } -// } -// -// test("Simple selection form orc table") { -// val tempDir = getTempFilePath("orcTest").getCanonicalPath -// val data = sparkContext.parallelize((1 to 10)) -// .map(i => Person(s"name_$i", i, (0 until 2).map{ m=> -// Contact(s"contact_$m", s"phone_$m") })) -// data.saveAsOrcFile(tempDir) -// val f = TestHive.orcFile(tempDir) -// f.registerTempTable("tmp") -// var rdd = sql("SELECT name FROM tmp where age <= 5") -// assert(rdd.count() == 5) -// -// rdd = sql("SELECT name, contacts FROM tmp where age > 5") -// assert(rdd.count() == 5) -// -// val contacts = rdd.flatMap(t=>t(1).asInstanceOf[Seq[_]]) -// assert(contacts.count() == 10) -// Utils.deleteRecursively(new File(tempDir)) -// } -// -// test("save and load case class RDD with Nones as orc") { -// val data = OptionalReflectData(None, None, None, None, None) -// val rdd = sparkContext.parallelize(data :: data :: data :: Nil) -// val tempDir = getTempFilePath("orcTest").getCanonicalPath -// rdd.saveAsOrcFile(tempDir) -// val readFile = TestHive.orcFile(tempDir) -// val rdd_saved = readFile.collect() -// assert(rdd_saved(0) === Seq.fill(5)(null)) -// Utils.deleteRecursively(new File(tempDir)) -// } -// -// test("Compression options for writing to a Orcfile") { -// val tempDir = getTempFilePath("orcTest").getCanonicalPath -// val rdd = TestHive.sparkContext.parallelize((1 to 100)) -// .map(i => TestRDDEntry(i, s"val_$i")) -// -// // test default compression codec, now only support zlib -// rdd.saveAsOrcFile(tempDir) -// val actualCodec = OrcFileOperator.getMetaDataReader(new Path(tempDir), Some(new Configuration())).getCompression.name -// assert(actualCodec == "ZLIB") -// -// Utils.deleteRecursively(new File(tempDir)) -// } -// -// test("Get ORC Schema with ORC Reader") { -// val path = "src/test/resources/data/files/orcfiles" -// val attributes = OrcFileOperator.orcSchema(path, Some(TestHive.sparkContext.hadoopConfiguration), new Properties()) -// assert(attributes(0).dataType == StringType) -// assert(attributes(1).dataType == IntegerType) -// assert(attributes(2).dataType == LongType) -// assert(attributes(3).dataType == FloatType) -// assert(attributes(4).dataType == DoubleType) -// assert(attributes(5).dataType == ShortType) -// assert(attributes(6).dataType == ByteType) -// assert(attributes(7).dataType == BooleanType) -// } -// -// ignore("Other Compression options for writing to an Orcfile only supported in hive 0.13.1 and above") { -// TestHive.sparkContext.hadoopConfiguration.set(orcDefaultCompressVar, "SNAPPY") -// var tempDir = getTempFilePath("orcTest").getCanonicalPath -// val rdd = 
sparkContext.parallelize((1 to 100)) -// .map(i => TestRDDEntry(i, s"val_$i")) -// rdd.saveAsOrcFile(tempDir) -// var actualCodec = OrcFileOperator.getMetaDataReader(new Path(tempDir), Some(new Configuration())).getCompression -// assert(actualCodec == CompressionKind.SNAPPY) -// Utils.deleteRecursively(new File(tempDir)) -// -// TestHive.sparkContext.hadoopConfiguration.set(orcDefaultCompressVar, "NONE") -// tempDir = getTempFilePath("orcTest").getCanonicalPath -// rdd.saveAsOrcFile(tempDir) -// actualCodec = OrcFileOperator.getMetaDataReader(new Path(tempDir), Some(new Configuration())).getCompression -// assert(actualCodec == CompressionKind.NONE) -// Utils.deleteRecursively(new File(tempDir)) -// -// TestHive.sparkContext.hadoopConfiguration.set(orcDefaultCompressVar, "LZO") -// tempDir = getTempFilePath("orcTest").getCanonicalPath -// rdd.saveAsOrcFile(tempDir) -// actualCodec = OrcFileOperator.getMetaDataReader(new Path(tempDir), Some(new Configuration())).getCompression -// assert(actualCodec == CompressionKind.LZO) -// Utils.deleteRecursively(new File(tempDir)) -// } -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive.orc + +import java.util.Properties +import java.io.File +import org.scalatest.BeforeAndAfterAll +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.catalyst.util.getTempFilePath +import org.apache.spark.sql.hive.test.TestHive._ +import org.apache.spark.sql.catalyst.types._ +import org.apache.spark.util.Utils +import org.apache.hadoop.hive.ql.io.orc.CompressionKind + +case class TestRDDEntry(key: Int, value: String) + +case class NullReflectData( + intField: java.lang.Integer, + longField: java.lang.Long, + floatField: java.lang.Float, + doubleField: java.lang.Double, + booleanField: java.lang.Boolean) + +case class OptionalReflectData( + intField: Option[Int], + longField: Option[Long], + floatField: Option[Float], + doubleField: Option[Double], + booleanField: Option[Boolean]) + +case class Nested(i: Int, s: String) + +case class Data(array: Seq[Int], nested: Nested) + +case class Contact(name: String, phone: String) + +case class Person(name: String, age: Int, contacts: Seq[Contact]) + +case class AllDataTypes( + stringField: String, + intField: Int, + longField: Long, + floatField: Float, + doubleField: Double, + shortField: Short, + byteField: Byte, + booleanField: Boolean) + +case class AllDataTypesWithNonPrimitiveType( + stringField: String, + intField: Int, + longField: Long, + floatField: Float, + doubleField: Double, + shortField: Short, + byteField: Byte, + booleanField: Boolean, + array: Seq[Int], + arrayContainsNull: Seq[Option[Int]], + map: Map[Int, Long], + mapValueContainsNull: Map[Int, Option[Long]], + data: Data) + +case class BinaryData(binaryData: Array[Byte]) + +class OrcQuerySuite extends QueryTest with BeforeAndAfterAll { + + test("Read/Write All Types") { + val tempDir = getTempFilePath("orcTest").getCanonicalPath + val range = (0 to 255) + val data = sparkContext + .parallelize(range) + .map(x => AllDataTypes( + s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0)) + + data.saveAsOrcFile(tempDir) + checkAnswer( + TestHive.orcFile(tempDir), + data.toSchemaRDD.collect().toSeq) + + Utils.deleteRecursively(new File(tempDir)) + } + + test("read/write binary data") { + val tempDir = getTempFilePath("orcTest").getCanonicalPath + val range = (0 to 3) + sparkContext.parallelize(range) + .map(x => BinaryData(s"test$x".getBytes("utf8"))).saveAsOrcFile(tempDir) + + TestHive.orcFile(tempDir) + .map(r => new String(r(0).asInstanceOf[Array[Byte]], "utf8")) + .collect().toSet == Set("test0", "test1", "test2", "test3") + Utils.deleteRecursively(new File(tempDir)) + } + + test("read/right empty RDD") { + val tempDir = getTempFilePath("orcTest").getCanonicalPath + val range = (0 to 255) + val data = sparkContext + .parallelize(range) + .map(x => AllDataTypes( + s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0)) + .filter(_.intField > 500) + data.saveAsOrcFile(tempDir) + checkAnswer( + TestHive.orcFile(tempDir), + data.toSchemaRDD.collect().toSeq) + + Utils.deleteRecursively(new File(tempDir)) + } + + test("Read/Write All Types with non-primitive type") { + val tempDir = getTempFilePath("orcTest").getCanonicalPath + val range = (0 to 255) + val data = sparkContext.parallelize(range) + .map(x => AllDataTypesWithNonPrimitiveType( + s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0, + (0 until x), + (0 until x).map(Option(_).filter(_ 
% 3 == 0)), + (0 until x).map(i => i -> i.toLong).toMap, + (0 until x).map(i => i -> Option(i.toLong)).toMap + (x -> None), + Data((0 until x), Nested(x, s"$x")))) + data.saveAsOrcFile(tempDir) + + checkAnswer( + TestHive.orcFile(tempDir), + data.toSchemaRDD.collect().toSeq) + Utils.deleteRecursively(new File(tempDir)) + } + + test("Creating case class RDD table") { + sparkContext.parallelize((1 to 100)) + .map(i => TestRDDEntry(i, s"val_$i")) + .registerTempTable("tmp") + val rdd = sql("SELECT * FROM tmp").collect().sortBy(_.getInt(0)) + var counter = 1 + rdd.foreach { + // '===' does not like string comparison? + row => { + assert(row.getString(1).equals(s"val_$counter"), s"row $counter value ${row.getString(1)} does not match val_$counter") + counter = counter + 1 + } + } + } + + test("Simple selection form orc table") { + val tempDir = getTempFilePath("orcTest").getCanonicalPath + val data = sparkContext.parallelize((1 to 10)) + .map(i => Person(s"name_$i", i, (0 until 2).map{ m=> + Contact(s"contact_$m", s"phone_$m") })) + data.saveAsOrcFile(tempDir) + val f = TestHive.orcFile(tempDir) + f.registerTempTable("tmp") + var rdd = sql("SELECT name FROM tmp where age <= 5") + assert(rdd.count() == 5) + + rdd = sql("SELECT name, contacts FROM tmp where age > 5") + assert(rdd.count() == 5) + + val contacts = rdd.flatMap(t=>t(1).asInstanceOf[Seq[_]]) + assert(contacts.count() == 10) + Utils.deleteRecursively(new File(tempDir)) + } + + test("save and load case class RDD with Nones as orc") { + val data = OptionalReflectData(None, None, None, None, None) + val rdd = sparkContext.parallelize(data :: data :: data :: Nil) + val tempDir = getTempFilePath("orcTest").getCanonicalPath + rdd.saveAsOrcFile(tempDir) + val readFile = TestHive.orcFile(tempDir) + val rdd_saved = readFile.collect() + assert(rdd_saved(0) === Seq.fill(5)(null)) + Utils.deleteRecursively(new File(tempDir)) + } + + test("Compression options for writing to a Orcfile") { + val tempDir = getTempFilePath("orcTest").getCanonicalPath + val rdd = TestHive.sparkContext.parallelize((1 to 100)) + .map(i => TestRDDEntry(i, s"val_$i")) + + // test default compression codec, now only support zlib + rdd.saveAsOrcFile(tempDir) + val actualCodec = OrcFileOperator.getMetaDataReader(new Path(tempDir), Some(new Configuration())).getCompression.name + assert(actualCodec == "ZLIB") + + Utils.deleteRecursively(new File(tempDir)) + } + + test("Get ORC Schema with ORC Reader") { + val path = "src/test/resources/data/files/orcfiles" + val attributes = OrcFileOperator.orcSchema(path, Some(TestHive.sparkContext.hadoopConfiguration), new Properties()) + assert(attributes(0).dataType == StringType) + assert(attributes(1).dataType == IntegerType) + assert(attributes(2).dataType == LongType) + assert(attributes(3).dataType == FloatType) + assert(attributes(4).dataType == DoubleType) + assert(attributes(5).dataType == ShortType) + assert(attributes(6).dataType == ByteType) + assert(attributes(7).dataType == BooleanType) + } + + ignore("Other Compression options for writing to an Orcfile only supported in hive 0.13.1 and above") { + TestHive.sparkContext.hadoopConfiguration.set(orcDefaultCompressVar, "SNAPPY") + var tempDir = getTempFilePath("orcTest").getCanonicalPath + val rdd = sparkContext.parallelize((1 to 100)) + .map(i => TestRDDEntry(i, s"val_$i")) + rdd.saveAsOrcFile(tempDir) + var actualCodec = OrcFileOperator.getMetaDataReader(new Path(tempDir), Some(new Configuration())).getCompression + assert(actualCodec == CompressionKind.SNAPPY) + 
Utils.deleteRecursively(new File(tempDir)) + + TestHive.sparkContext.hadoopConfiguration.set(orcDefaultCompressVar, "NONE") + tempDir = getTempFilePath("orcTest").getCanonicalPath + rdd.saveAsOrcFile(tempDir) + actualCodec = OrcFileOperator.getMetaDataReader(new Path(tempDir), Some(new Configuration())).getCompression + assert(actualCodec == CompressionKind.NONE) + Utils.deleteRecursively(new File(tempDir)) + + TestHive.sparkContext.hadoopConfiguration.set(orcDefaultCompressVar, "LZO") + tempDir = getTempFilePath("orcTest").getCanonicalPath + rdd.saveAsOrcFile(tempDir) + actualCodec = OrcFileOperator.getMetaDataReader(new Path(tempDir), Some(new Configuration())).getCompression + assert(actualCodec == CompressionKind.LZO) + Utils.deleteRecursively(new File(tempDir)) + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala new file mode 100644 index 0000000000000..e864a9643a3fd --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.orc + +import org.apache.spark.sql.QueryTest +import org.scalatest.BeforeAndAfterAll +import java.io.File +import org.apache.spark.sql.hive.test.TestHive._ +import org.apache.spark.sql.hive.orc.OrcData +import org.apache.spark.sql.hive.orc.OrcDataWithKey + +// The data where the partitioning key exists only in the directory structure. +case class OrcData(intField: Int, stringField: String) +// The data that also includes the partitioning key +case class OrcDataWithKey(p: Int, intField: Int, stringField: String) + +/** + * A collection of tests for parquet data with various forms of partitioning. 
+ */ +abstract class OrcTest extends QueryTest with BeforeAndAfterAll { + var partitionedTableDir: File = null + var partitionedTableDirWithKey: File = null + + override def beforeAll(): Unit = { + partitionedTableDir = File.createTempFile("orctests", "sparksql") + partitionedTableDir.delete() + partitionedTableDir.mkdir() + + (1 to 10).foreach { p => + val partDir = new File(partitionedTableDir, s"p=$p") + sparkContext.makeRDD(1 to 10) + .map(i => OrcData(i, s"part-$p")) + .saveAsOrcFile(partDir.getCanonicalPath) + } + + partitionedTableDirWithKey = File.createTempFile("parquettests", "sparksql") + partitionedTableDirWithKey.delete() + partitionedTableDirWithKey.mkdir() + + (1 to 10).foreach { p => + val partDir = new File(partitionedTableDirWithKey, s"p=$p") + sparkContext.makeRDD(1 to 10) + .map(i => OrcDataWithKey(p, i, s"part-$p")) + .saveAsOrcFile(partDir.getCanonicalPath) + } + } + + Seq("partitioned_parquet", "partitioned_parquet_with_key").foreach { table => + test(s"project the partitioning column $table") { + checkAnswer( + sql(s"SELECT p, count(*) FROM $table group by p"), + (1, 10) :: + (2, 10) :: + (3, 10) :: + (4, 10) :: + (5, 10) :: + (6, 10) :: + (7, 10) :: + (8, 10) :: + (9, 10) :: + (10, 10) :: Nil + ) + } + + test(s"project partitioning and non-partitioning columns $table") { + checkAnswer( + sql(s"SELECT stringField, p, count(intField) FROM $table GROUP BY p, stringField"), + ("part-1", 1, 10) :: + ("part-2", 2, 10) :: + ("part-3", 3, 10) :: + ("part-4", 4, 10) :: + ("part-5", 5, 10) :: + ("part-6", 6, 10) :: + ("part-7", 7, 10) :: + ("part-8", 8, 10) :: + ("part-9", 9, 10) :: + ("part-10", 10, 10) :: Nil + ) + } + + test(s"simple count $table") { + checkAnswer( + sql(s"SELECT COUNT(*) FROM $table"), + 100) + } + + test(s"pruned count $table") { + checkAnswer( + sql(s"SELECT COUNT(*) FROM $table WHERE p = 1"), + 10) + } + + test(s"multi-partition pruned count $table") { + checkAnswer( + sql(s"SELECT COUNT(*) FROM $table WHERE p IN (1,2,3)"), + 30) + } + + test(s"non-partition predicates $table") { + checkAnswer( + sql(s"SELECT COUNT(*) FROM $table WHERE intField IN (1,2,3)"), + 30) + } + + test(s"sum $table") { + checkAnswer( + sql(s"SELECT SUM(intField) FROM $table WHERE intField IN (1,2,3) AND p = 1"), + 1 + 2 + 3) + } + + test(s"hive udfs $table") { + checkAnswer( + sql(s"SELECT concat(stringField, stringField) FROM $table"), + sql(s"SELECT stringField FROM $table").map { + case Row(s: String) => Row(s + s) + }.collect().toSeq) + } + } + + test("non-part select(*)") { + checkAnswer( + sql("SELECT COUNT(*) FROM normal_parquet"), + 10) + } +} + +class OrcSourceSuite { + +} From 3ca68eb728c4cbfe18cde1ebc8c7b8aa00a06604 Mon Sep 17 00:00:00 2001 From: w00228970 Date: Thu, 20 Nov 2014 15:50:33 +0800 Subject: [PATCH 30/32] no used bin file --- .../data/files/orcfiles/.part-r-1.orc.crc | Bin 20 -> 0 bytes .../data/files/orcfiles/.part-r-2.orc.crc | Bin 20 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 sql/hive/src/test/resources/data/files/orcfiles/.part-r-1.orc.crc delete mode 100644 sql/hive/src/test/resources/data/files/orcfiles/.part-r-2.orc.crc diff --git a/sql/hive/src/test/resources/data/files/orcfiles/.part-r-1.orc.crc b/sql/hive/src/test/resources/data/files/orcfiles/.part-r-1.orc.crc deleted file mode 100644 index 048f7346c2c05e08ebe5a11aed78665f40e3e378..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20 bcmYc;N@ieSU}BgvvBaS1*w4n6U^Z_6Ku`wu diff --git 
a/sql/hive/src/test/resources/data/files/orcfiles/.part-r-2.orc.crc b/sql/hive/src/test/resources/data/files/orcfiles/.part-r-2.orc.crc deleted file mode 100644 index 52810863f2ae9a45c8f00f8d022cbe68369ea23b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20 bcmYc;N@ieSU}BK>5DB}x^|_y9#fBvSHcbY& From 1e0c1d9073e2f5b6261d1c8f85e043e94f084f96 Mon Sep 17 00:00:00 2001 From: scwf Date: Sat, 22 Nov 2014 23:00:54 +0800 Subject: [PATCH 31/32] Revert "https -> http in pom" This reverts commit 5f5fda8244c59970d117a0e9d32bd8bddde348f2. --- pom.xml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pom.xml b/pom.xml index a32c55cfb2c8f..6c1c1214a7d3e 100644 --- a/pom.xml +++ b/pom.xml @@ -156,7 +156,7 @@ central Maven Repository - http://repo1.maven.org/maven2 + https://repo1.maven.org/maven2 true @@ -167,7 +167,7 @@ apache-repo Apache Repository - http://repository.apache.org/content/repositories/releases + https://repository.apache.org/content/repositories/releases true @@ -178,7 +178,7 @@ jboss-repo JBoss Repository - http://repository.jboss.org/nexus/content/repositories/releases + https://repository.jboss.org/nexus/content/repositories/releases true @@ -189,7 +189,7 @@ mqtt-repo MQTT Repository - http://repo.eclipse.org/content/repositories/paho-releases + https://repo.eclipse.org/content/repositories/paho-releases true @@ -200,7 +200,7 @@ cloudera-repo Cloudera Repository - http://repository.cloudera.com/artifactory/cloudera-repos + https://repository.cloudera.com/artifactory/cloudera-repos true @@ -222,7 +222,7 @@ spring-releases Spring Release Repository - http://repo.spring.io/libs-release + https://repo.spring.io/libs-release true @@ -234,7 +234,7 @@ spark-staging Spring Staging Repository - http://oss.sonatype.org/content/repositories/orgspark-project-1085 + https://oss.sonatype.org/content/repositories/orgspark-project-1085 true @@ -246,7 +246,7 @@ spark-staging-hive13 Spring Staging Repository Hive 13 - http://oss.sonatype.org/content/repositories/orgspark-project-1089/ + https://oss.sonatype.org/content/repositories/orgspark-project-1089/ true @@ -270,7 +270,7 @@ central - http://repo1.maven.org/maven2 + https://repo1.maven.org/maven2 true From c5236efce8397f7c0a1bd436121aa6c13f8511b6 Mon Sep 17 00:00:00 2001 From: Zhan Zhang Date: Mon, 1 Dec 2014 13:35:24 -0800 Subject: [PATCH 32/32] ppd support --- .../spark/sql/hive/HiveStrategies.scala | 36 ++++- .../spark/sql/hive/orc/OrcFilters.scala | 125 ++++++++++++++++++ .../spark/sql/hive/orc/OrcRelation.scala | 1 + .../sql/hive/orc/OrcTableOperations.scala | 19 ++- .../apache/spark/sql/hive/orc/package.scala | 12 +- 5 files changed, 178 insertions(+), 15 deletions(-) create mode 100644 sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index 4f114eebd2999..90473a5dc9f13 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -17,7 +17,9 @@ package org.apache.spark.sql.hive +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hive.ql.parse.ASTNode +import org.apache.hadoop.mapreduce.Job import org.apache.spark.annotation.Experimental import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute @@ -30,7 +32,7 @@ import org.apache.spark.sql.catalyst.types.StringType 
 import org.apache.spark.sql.execution.{DescribeCommand, OutputFaker, SparkPlan}
 import org.apache.spark.sql.hive
 import org.apache.spark.sql.hive.execution._
-import org.apache.spark.sql.hive.orc.{WriteToOrcFile, InsertIntoOrcTable, OrcRelation, OrcTableScan}
+import org.apache.spark.sql.hive.orc._
 import org.apache.spark.sql.parquet.ParquetRelation
 import org.apache.spark.sql.{SQLContext, SchemaRDD, Strategy}
@@ -243,8 +245,36 @@
       case logical.InsertIntoTable(table: OrcRelation, partition, child, overwrite) =>
         InsertIntoOrcTable(table, planLater(child), overwrite) :: Nil
       case PhysicalOperation(projectList, filters, relation: OrcRelation) =>
-        // TODO: need to implement predict push down.
-        val prunePushedDownFilters = identity[Seq[Expression]] _
+        val prunePushedDownFilters = {
+          OrcRelation.jobConf = sparkContext.hadoopConfiguration
+          if (ORC_FILTER_PUSHDOWN_ENABLED) {
+            val job = new Job(OrcRelation.jobConf)
+            val conf: Configuration = job.getConfiguration
+            logInfo("Orc push down filter enabled:" + filters)
+            (filters: Seq[Expression]) => {
+              val recordFilter = OrcFilters.createFilter(filters)
+              if (recordFilter.isDefined) {
+
+                logInfo("Parsed filters:" + recordFilter)
+                /**
+                 * To test this, set the following so that the reader
+                 * will not read the whole file when it is small:
+                 * sparkContext.hadoopConfiguration.setInt(
+                 *   "mapreduce.input.fileinputformat.split.maxsize", 50)
+                 */
+                conf.set(SARG_PUSHDOWN, toKryo(recordFilter.get))
+                conf.setBoolean("hive.optimize.index.filter", true)
+                OrcRelation.jobConf = conf
+              }
+              // No matter whether ORC filtered anything or not, the filters still need
+              // to be evaluated at a finer granularity in the upper layer,
+              // so return all of them.
+              filters
+            }
+          } else {
+            identity[Seq[Expression]] _
+          }
+        }
         pruneFilterProject(
           projectList,
           filters,
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala
new file mode 100644
index 0000000000000..faaa00c59a022
--- /dev/null
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.spark.sql.hive.orc + +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.hadoop.hive.ql.io.sarg.SearchArgument +import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.Builder +import org.apache.spark.Logging + +private[sql] object OrcFilters extends Logging { + + def createFilter(expr: Seq[Expression]): Option[SearchArgument] = { + if (expr == null || expr.size == 0) return None + var sarg: Option[Builder] = Some(SearchArgument.FACTORY.newBuilder()) + sarg.get.startAnd() + expr.foreach { + x => { + sarg match { + case Some(s1) => sarg = createFilter(x, s1) + case _ => None + } + } + } + sarg match { + case Some(b) => Some(b.end.build) + case _ => None + } + } + + def createFilter(expression: Expression, builder: Builder): Option[Builder] = { + expression match { + case p@And(left: Expression, right: Expression) => { + val b1 = builder.startAnd() + val b2 = createFilter(left, b1) + b2 match { + case Some(b) => val b3 = createFilter(right, b) + if (b3.isDefined) { + Some(b3.get.end) + } else { + None + } + case _ => None + } + } + case p@Or(left: Expression, right: Expression) => { + val b1 = builder.startOr() + val b2 = createFilter(left, b1) + b2 match { + case Some(b) => val b3 = createFilter(right, b) + if (b3.isDefined) { + Some(b3.get.end) + } else { + None + } + case _ => None + } + } + case p@EqualTo(left: Literal, right: NamedExpression) => { + val b1 = builder.equals(right.name, left.value) + Some(b1) + } + case p@EqualTo(left: NamedExpression, right: Literal) => { + val b1 = builder.equals(left.name, right.value) + Some(b1) + } + case p@LessThan(left: NamedExpression, right: Literal) => { + val b1 = builder.lessThan(left.name, right.value) + Some(b1) + } + case p@LessThan(left: Literal, right: NamedExpression) => { + val b1 = builder.startNot().lessThanEquals(right.name, left.value).end() + Some(b1) + } + case p@LessThanOrEqual(left: NamedExpression, right: Literal) => { + val b1 = builder.lessThanEquals(left.name, right.value) + Some(b1) + } + case p@LessThanOrEqual(left: Literal, right: NamedExpression) => { + val b1 = builder.startNot().lessThan(right.name, left.value).end() + Some(b1) + } + case p@GreaterThan(left: NamedExpression, right: Literal) => { + val b1 = builder.startNot().lessThanEquals(left.name, right.value).end() + Some(b1) + } + case p@GreaterThan(left: Literal, right: NamedExpression) => { + val b1 = builder.lessThanEquals(right.name, left.value) + Some(b1) + } + case p@GreaterThanOrEqual(left: NamedExpression, right: Literal) => { + val b1 = builder.startNot().lessThan(left.name, right.value).end() + Some(b1) + } + case p@GreaterThanOrEqual(left: Literal, right: NamedExpression) => { + val b1 = builder.lessThan(right.name, left.value) + Some(b1) + } + // TODO: test it + case p@EqualNullSafe(left: NamedExpression, right: NamedExpression) => { + val b1 = builder.nullSafeEquals(left.name, right.name) + Some(b1) + } + case p@In(left: NamedExpression, list: Seq[Literal]) => { + val b1 = builder.in(left.name, list.map(_.value).toArray) + Some(b1) + } + case _ => None + } + } +} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala index fbafae5181723..73ad0a1cf7ae3 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala @@ -94,6 +94,7 @@ private[sql] object OrcRelation { } path } + var jobConf: Configuration = _ 
} private[sql] object OrcFileOperator{ diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala index 34a26f4b301c2..314b45b4a14ca 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcTableOperations.scala @@ -77,8 +77,7 @@ case class OrcTableScan( override def execute(): RDD[Row] = { val sc = sqlContext.sparkContext - val job = new Job(sc.hadoopConfiguration) - + val job = new Job(OrcRelation.jobConf) // sc.hadoopConfiguration) val conf: Configuration = job.getConfiguration relation.path.split(",").foreach { curPath => val qualifiedPath = { @@ -110,17 +109,15 @@ case class OrcTableScan( * @param conf */ def addColumnIds(output: Seq[Attribute], relation: OrcRelation, conf: Configuration) { - val fieldIdMap = relation.output.map(_.name).zipWithIndex.toMap - val names = output.map(_.name) - val ids = output.map { att => - val realName = att.name.toLowerCase(Locale.ENGLISH) - fieldIdMap.getOrElse(realName, -1) - }.filter(_ >= 0).map(_.asInstanceOf[Integer]) + val ids = + output.map(a => + relation.output.indexWhere(_.name == a.name): Integer) + .filter(_ >= 0) + val names = output.map(_.name) assert(ids.size == names.size, "columns id and name length does not match!") - if (ids != null && !ids.isEmpty) { - HiveShim.appendReadColumns(conf, ids, names) - } + val sorted = ids.zip(names).sorted + HiveShim.appendReadColumns(conf, sorted.map(_._1), sorted.map(_._2)) } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala index 6fd07ece972dc..ee48b37ff4e42 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/package.scala @@ -17,6 +17,9 @@ package org.apache.spark.sql.hive +import com.esotericsoftware.kryo.Kryo +import com.esotericsoftware.kryo.io.Output +import org.apache.commons.codec.binary.Base64 import org.apache.spark.sql.{SQLContext, SchemaRDD} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.hadoop.hive.serde2.objectinspector._ @@ -46,6 +49,13 @@ package object orc { // for orc compression type, only take effect in hive 0.13.1 val orcDefaultCompressVar = "hive.exec.orc.default.compress" // for prediction push down in hive-0.13.1, don't enable it - val ORC_FILTER_PUSHDOWN_ENABLED = false + val ORC_FILTER_PUSHDOWN_ENABLED = true val SARG_PUSHDOWN = "sarg.pushdown" + + def toKryo(input: Any) = { + val out = new Output(4 * 1024, 10 * 1024 * 1024); + new Kryo().writeObject(out, input); + out.close(); + Base64.encodeBase64String(out.toBytes()); + } }
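
The predicate pushdown added in this last commit cuts across three files: the strategy in HiveStrategies.scala asks OrcFilters.createFilter to translate the Catalyst filter expressions into a Hive SearchArgument, serializes the result with Kryo and Base64 via toKryo, and publishes it in the Hadoop configuration under "sarg.pushdown" (with hive.optimize.index.filter enabled) so that OrcInputFormat can skip stripes whose statistics cannot satisfy the predicate. The sketch below condenses that flow into a standalone snippet; it is illustrative only, the column name "age" and the literal bounds are invented, and it assumes the same Hive 0.13 SearchArgument builder API the patch itself uses.

// Sketch of the pushdown path for a predicate like "age > 5 AND age <= 100".
import com.esotericsoftware.kryo.Kryo
import com.esotericsoftware.kryo.io.Output
import org.apache.commons.codec.binary.Base64
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument

object SargPushdownSketch {
  def main(args: Array[String]): Unit = {
    // "age > 5" is expressed as NOT(age <= 5), mirroring the GreaterThan case in OrcFilters.
    val sarg: SearchArgument = SearchArgument.FACTORY.newBuilder()
      .startAnd()
      .startNot().lessThanEquals("age", Integer.valueOf(5)).end()
      .lessThanEquals("age", Integer.valueOf(100))
      .end()
      .build()

    // Kryo-serialize and Base64-encode the SearchArgument (what toKryo does), then hand it
    // to the ORC reader through the configuration keys the patch uses.
    val out = new Output(4 * 1024, 10 * 1024 * 1024)
    new Kryo().writeObject(out, sarg)
    out.close()

    val conf = new Configuration()
    conf.set("sarg.pushdown", Base64.encodeBase64String(out.toBytes))
    conf.setBoolean("hive.optimize.index.filter", true)
  }
}

Note that, as the comment in the strategy explains, the original filters are still returned to pruneFilterProject: the SearchArgument only lets the reader skip whole stripes and row groups, so row-level filtering still happens in Spark.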