Skip to content

Commit

Permalink
[SQL] Add test case with workaround for reading partitioned Avro files
Browse files Browse the repository at this point in the history
In order to read from partitioned Avro files, we also need to set the `SERDEPROPERTIES`, since `TBLPROPERTIES` are not passed to the SerDe when it is initialized. This PR simply adds a test to make sure we don't break this workaround.

Author: Michael Armbrust <michael@databricks.com>

Closes #2340 from marmbrus/avroPartitioned and squashes the following commits:

6b969d6 [Michael Armbrust] fix style
fea2124 [Michael Armbrust] Add test case with workaround for reading partitioned avro files.
  • Loading branch information
marmbrus committed Sep 11, 2014
1 parent 79cdb9b commit 84e2c8b
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 1 deletion.
69 changes: 68 additions & 1 deletion sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,74 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) {
|)
""".stripMargin.cmd,
s"LOAD DATA LOCAL INPATH '${getHiveFile("data/files/episodes.avro")}' INTO TABLE episodes".cmd
)
),
// THIS TABLE IS NOT THE SAME AS THE HIVE TEST TABLE episodes_partitioned AS DYNAMIC PARTITIONING
// IS NOT YET SUPPORTED
//
// Test table `episodes_part`: an Avro-backed table partitioned by `doctor_pt`. It is set up by
// three commands run in order: CREATE TABLE, ALTER TABLE ... SET SERDEPROPERTIES, and an
// INSERT OVERWRITE into one explicitly named partition.
TestTable("episodes_part",
// Command 1: create the partitioned table using the Avro SerDe and the Avro container
// input/output formats. The Avro schema is supplied inline through TBLPROPERTIES.
s"""CREATE TABLE episodes_part (title STRING, air_date STRING, doctor INT)
|PARTITIONED BY (doctor_pt INT)
|ROW FORMAT SERDE '${classOf[AvroSerDe].getCanonicalName}'
|STORED AS
|INPUTFORMAT '${classOf[AvroContainerInputFormat].getCanonicalName}'
|OUTPUTFORMAT '${classOf[AvroContainerOutputFormat].getCanonicalName}'
|TBLPROPERTIES (
| 'avro.schema.literal'='{
| "type": "record",
| "name": "episodes",
| "namespace": "testing.hive.avro.serde",
| "fields": [
| {
| "name": "title",
| "type": "string",
| "doc": "episode title"
| },
| {
| "name": "air_date",
| "type": "string",
| "doc": "initial date"
| },
| {
| "name": "doctor",
| "type": "int",
| "doc": "main actor playing the Doctor in episode"
| }
| ]
| }'
|)
""".stripMargin.cmd,
// Command 2: set the same schema literal again, this time as a SERDEPROPERTY.
// WORKAROUND: Required to pass schema to SerDe for partitioned tables.
// TODO: Pass this automatically from the table to partitions.
// NOTE: the schema literal below duplicates the one given in TBLPROPERTIES above; keep the
// two copies in sync if either is edited.
s"""
|ALTER TABLE episodes_part SET SERDEPROPERTIES (
| 'avro.schema.literal'='{
| "type": "record",
| "name": "episodes",
| "namespace": "testing.hive.avro.serde",
| "fields": [
| {
| "name": "title",
| "type": "string",
| "doc": "episode title"
| },
| {
| "name": "air_date",
| "type": "string",
| "doc": "initial date"
| },
| {
| "name": "doctor",
| "type": "int",
| "doc": "main actor playing the Doctor in episode"
| }
| ]
| }'
|)
""".stripMargin.cmd,
// Command 3: populate the table by copying every row of the `episodes` test table into the
// single, explicitly named partition doctor_pt=1 (no dynamic partitioning is used).
s"""
INSERT OVERWRITE TABLE episodes_part PARTITION (doctor_pt=1)
SELECT title, air_date, doctor FROM episodes
""".cmd
)
)

hiveQTestUtilTables.foreach(registerTestTable)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
The Eleventh Hour 3 April 2010 11 1
The Doctor's Wife 14 May 2011 11 1
Horror of Fang Rock 3 September 1977 4 1
An Unearthly Child 23 November 1963 1 1
The Mysterious Planet 6 September 1986 6 1
Rose 26 March 2005 9 1
The Power of the Daleks 5 November 1966 2 1
Castrolava 4 January 1982 5 1
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,6 @@ class HiveSerDeSuite extends HiveComparisonTest with BeforeAndAfterAll {
createQueryTest("Read with RegexSerDe", "SELECT * FROM sales")

createQueryTest("Read with AvroSerDe", "SELECT * FROM episodes")

createQueryTest("Read Partitioned with AvroSerDe", "SELECT * FROM episodes_part")
}

0 comments on commit 84e2c8b

Please sign in to comment.