
[SPARK-27402][SQL][TEST-HADOOP3.2][TEST-MAVEN] Fix hadoop-3.2 test issues (except the hive-thriftserver module)

## What changes were proposed in this pull request?

This PR fixes the hadoop-3.2 test issues (except for the `hive-thriftserver` module):
1. Add `hive.metastore.schema.verification` and `datanucleus.schema.autoCreateAll` to HiveConf.
2. Make the hadoop-3.2 profile support accessing Hive metastore versions from 0.12 to 2.2 (see the configuration sketch below).
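
For illustration, pinning Spark to an older external metastore uses the same two configs this PR adds to `HiveExternalCatalogVersionsSuite`; a minimal sketch (the app name is a placeholder, and `1.2.1` is just one supported version):

```scala
import org.apache.spark.sql.SparkSession

// Point Spark at a Hive 1.2.1 metastore and fetch the matching client jars
// from Maven; both config keys appear verbatim in the test changes below.
val spark = SparkSession.builder()
  .appName("metastore-version-demo") // hypothetical app name
  .config("spark.sql.hive.metastore.version", "1.2.1")
  .config("spark.sql.hive.metastore.jars", "maven")
  .enableHiveSupport()
  .getOrCreate()
```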

After [SPARK-27176](https://issues.apache.org/jira/browse/SPARK-27176) and this PR, we upgraded the built-in Hive to 2.3 when enabling the Hadoop 3.2+ profile. This upgrade fixes the following issues:
- [HIVE-6727](https://issues.apache.org/jira/browse/HIVE-6727): Table level stats for external tables are set incorrectly.
- [HIVE-15653](https://issues.apache.org/jira/browse/HIVE-15653): Some ALTER TABLE commands drop table stats.
- [SPARK-12014](https://issues.apache.org/jira/browse/SPARK-12014): Spark SQL query containing semicolon is broken in Beeline.
- [SPARK-25193](https://issues.apache.org/jira/browse/SPARK-25193): insert overwrite doesn't throw exception when drop old data fails.
- [SPARK-25919](https://issues.apache.org/jira/browse/SPARK-25919): Date value corrupts when tables are "ParquetHiveSerDe" formatted and target table is Partitioned.
- [SPARK-26332](https://issues.apache.org/jira/browse/SPARK-26332): Spark sql write orc table on viewFS throws exception.
- [SPARK-26437](https://issues.apache.org/jira/browse/SPARK-26437): Decimal data becomes bigint to query, unable to query.

## How was this patch tested?
This PR tests Spark's Hadoop 3.2 profile on Jenkins, and #24591 tests Spark's Hadoop 2.7 profile on Jenkins.

This PR closes #24591.

Closes #24391 from wangyum/SPARK-27402.

Authored-by: Yuming Wang <yumwang@ebay.com>
Signed-off-by: gatorsmile <gatorsmile@gmail.com>
wangyum authored and gatorsmile committed May 13, 2019
1 parent d169b0a commit f3ddd6f9da27925607c06e55cdfb9a809633238b
@@ -15,9 +15,17 @@
# limitations under the License.
#

from __future__ import print_function
from functools import total_ordering
import itertools
import re
import os

if os.environ.get("AMPLAB_JENKINS"):
    hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.7")
else:
    hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7")
print("[info] Choosing supported modules with Hadoop profile", hadoop_version)

all_modules = []

@@ -72,7 +80,11 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=
        self.dependent_modules = set()
        for dep in dependencies:
            dep.dependent_modules.add(self)
        all_modules.append(self)
        # TODO: Skip the hive-thriftserver module for hadoop-3.2; remove this once hadoop-3.2 supports it.
        if name == "hive-thriftserver" and hadoop_version == "hadoop3.2":
            print("[info] Skip unsupported module:", name)
        else:
            all_modules.append(self)

    def contains_file(self, filename):
        return any(re.match(p, filename) for p in self.source_file_prefixes)
@@ -1371,7 +1371,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils {
// if (isUsingHiveMetastore) {
// assert(storageFormat.properties.get("path") === expected)
// }
assert(storageFormat.locationUri === Some(expected))
assert(storageFormat.locationUri.map(_.getPath) === Some(expected.getPath))
}
// set table location
sql("ALTER TABLE dbx.tab1 SET LOCATION '/path/to/your/lovely/heart'")
@@ -118,7 +118,7 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll {
}
}

protected def testSelectiveDictionaryEncoding(isSelective: Boolean) {
protected def testSelectiveDictionaryEncoding(isSelective: Boolean, isHive23: Boolean = false) {
val tableName = "orcTable"

withTempDir { dir =>
@@ -171,7 +171,7 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll {
      // Hive 0.11 and RLE v2 is introduced in Hive 0.12 ORC with more improvements.
      // For more details, see https://orc.apache.org/specification/
      assert(stripe.getColumns(1).getKind === DICTIONARY_V2)
      if (isSelective) {
      if (isSelective || isHive23) {
        assert(stripe.getColumns(2).getKind === DIRECT_V2)
      } else {
        assert(stripe.getColumns(2).getKind === DICTIONARY_V2)
@@ -52,6 +52,7 @@ import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException}
import org.apache.spark.sql.execution.QueryExecutionException
import org.apache.spark.sql.execution.command.DDLUtils
import org.apache.spark.sql.hive.HiveExternalCatalog.{DATASOURCE_SCHEMA, DATASOURCE_SCHEMA_NUMPARTS, DATASOURCE_SCHEMA_PART_PREFIX}
import org.apache.spark.sql.hive.HiveUtils
import org.apache.spark.sql.hive.client.HiveClientImpl._
import org.apache.spark.sql.types._
import org.apache.spark.util.{CircularBuffer, Utils}
@@ -191,7 +192,29 @@ private[hive] class HiveClientImpl(
}

  /** Returns the configuration for the current session. */
  def conf: HiveConf = state.getConf
  def conf: HiveConf = if (!HiveUtils.isHive23) {
    state.getConf
  } else {
    val hiveConf = state.getConf
    // Hive changed the default of datanucleus.schema.autoCreateAll from true to false
    // and hive.metastore.schema.verification from false to true since Hive 2.0.
    // For details, see the JIRA HIVE-6113, HIVE-12463 and HIVE-1841.
    // isEmbeddedMetaStore should not be true in the production environment.
    // We hard-code hive.metastore.schema.verification and datanucleus.schema.autoCreateAll to
    // allow bin/spark-shell, bin/spark-sql and sbin/start-thriftserver.sh to automatically
    // create the Derby Metastore when running Spark in the non-production environment.
    val isEmbeddedMetaStore = {
      val msUri = hiveConf.getVar(ConfVars.METASTOREURIS)
      val msConnUrl = hiveConf.getVar(ConfVars.METASTORECONNECTURLKEY)
      (msUri == null || msUri.trim().isEmpty) &&
        (msConnUrl != null && msConnUrl.startsWith("jdbc:derby"))
    }
    if (isEmbeddedMetaStore) {
      hiveConf.setBoolean("hive.metastore.schema.verification", false)
      hiveConf.setBoolean("datanucleus.schema.autoCreateAll", true)
    }
    hiveConf
  }

private val userName = conf.getUser
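
As a rough illustration of the embedded-metastore check above (a sketch, not part of this diff): a default `HiveConf` has empty metastore URIs and a Derby connection URL, so both overrides apply, while any conf with `hive.metastore.uris` set is left untouched. The thrift host below is hypothetical.

```scala
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.conf.HiveConf.ConfVars

val embedded = new HiveConf()
// Hive's default connection URL is jdbc:derby:;databaseName=metastore_db;create=true,
// and METASTOREURIS defaults to empty, so this conf counts as an embedded metastore.
assert(embedded.getVar(ConfVars.METASTORECONNECTURLKEY).startsWith("jdbc:derby"))
assert(embedded.getVar(ConfVars.METASTOREURIS).trim.isEmpty)

val remote = new HiveConf()
remote.setVar(ConfVars.METASTOREURIS, "thrift://metastore-host:9083") // hypothetical host
// A non-empty metastore URI means a remote metastore: no overrides are applied.
assert(remote.getVar(ConfVars.METASTOREURIS).nonEmpty)
```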

@@ -28,6 +28,7 @@ import org.apache.commons.io.{FileUtils, IOUtils}
import org.apache.commons.lang3.{JavaVersion, SystemUtils}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hive.conf.HiveConf.ConfVars
import org.apache.hadoop.hive.shims.ShimLoader

import org.apache.spark.SparkConf
import org.apache.spark.deploy.SparkSubmitUtils
@@ -196,6 +197,7 @@ private[hive] class IsolatedClientLoader(
protected def isBarrierClass(name: String): Boolean =
name.startsWith(classOf[HiveClientImpl].getName) ||
name.startsWith(classOf[Shim].getName) ||
name.startsWith(classOf[ShimLoader].getName) ||
barrierPrefixes.exists(name.startsWith)

protected def classToPath(name: String): String =
@@ -63,6 +63,9 @@ object TestHive
// SPARK-8910
.set(UI_ENABLED, false)
.set(config.UNSAFE_EXCEPTION_ON_MEMORY_LEAK, true)
// Hive changed the default of hive.metastore.disallow.incompatible.col.type.changes
// from false to true. For details, see the JIRA HIVE-12320 and HIVE-17764.
.set("spark.hadoop.hive.metastore.disallow.incompatible.col.type.changes", "false")
// Disable ConvertToLocalRelation for better test coverage. Test cases built on
// LocalRelation will exercise the optimization rules better by disabling it as
// this rule may potentially block testing of other optimization rules such as
@@ -120,8 +123,10 @@ class TestHiveContext(
@transient override val sparkSession: TestHiveSparkSession)
extends SQLContext(sparkSession) {

  val HIVE_CONTRIB_JAR: String = "hive-contrib-0.13.1.jar"
  val HIVE_HCATALOG_CORE_JAR: String = "hive-hcatalog-core-0.13.1.jar"
  val HIVE_CONTRIB_JAR: String =
    if (HiveUtils.isHive23) "hive-contrib-2.3.4.jar" else "hive-contrib-0.13.1.jar"
  val HIVE_HCATALOG_CORE_JAR: String =
    if (HiveUtils.isHive23) "hive-hcatalog-core-2.3.4.jar" else "hive-hcatalog-core-0.13.1.jar"

/**
* If loadTestTables is false, no test tables are loaded. Note that this flag can only be true
@@ -58,11 +58,19 @@ class ClasspathDependenciesSuite extends SparkFunSuite {
}

test("shaded Protobuf") {
assertLoads("org.apache.hive.com.google.protobuf.ServiceException")
if (HiveUtils.isHive23) {
assertLoads("com.google.protobuf.ServiceException")
} else {
assertLoads("org.apache.hive.com.google.protobuf.ServiceException")
}
}

test("shaded Kryo") {
assertLoads("org.apache.hive.com.esotericsoftware.kryo.Kryo")
if (HiveUtils.isHive23) {
assertLoads("com.esotericsoftware.kryo.Kryo")
} else {
assertLoads("org.apache.hive.com.esotericsoftware.kryo.Kryo")
}
}

test("hive-common") {
@@ -81,7 +89,12 @@ class ClasspathDependenciesSuite extends SparkFunSuite {
}

test("parquet-hadoop-bundle") {
assertLoads("parquet.hadoop.ParquetOutputFormat")
assertLoads("parquet.hadoop.ParquetInputFormat")
if (HiveUtils.isHive23) {
assertLoads("org.apache.parquet.hadoop.ParquetOutputFormat")
assertLoads("org.apache.parquet.hadoop.ParquetInputFormat")
} else {
assertLoads("parquet.hadoop.ParquetOutputFormat")
assertLoads("parquet.hadoop.ParquetInputFormat")
}
}
}
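
For context, `assertLoads` boils down to resolving the given class name on the suite's classloader; a minimal sketch of that idea (the helper body is an assumption, not copied from the suite):

```scala
// Sketch: fail if the named class cannot be found on the classpath.
def assertLoads(className: String): Unit = {
  // Class.forName throws ClassNotFoundException when the class is absent.
  Class.forName(className, /* initialize = */ false, getClass.getClassLoader)
}
```

The split expectations exist because Hive 1.2 shaded protobuf and kryo under `org.apache.hive.com.*` and bundled pre-Apache Parquet under `parquet.*`, while the Hive 2.3 jars depend on the plain `com.google.protobuf`, `com.esotericsoftware.kryo`, and `org.apache.parquet` packages.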
@@ -186,6 +186,8 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils {
"--master", "local[2]",
"--conf", "spark.ui.enabled=false",
"--conf", "spark.master.rest.enabled=false",
"--conf", "spark.sql.hive.metastore.version=1.2.1",
"--conf", "spark.sql.hive.metastore.jars=maven",
"--conf", s"spark.sql.warehouse.dir=${wareHousePath.getCanonicalPath}",
"--conf", s"spark.sql.test.version.index=$index",
"--driver-java-options", s"-Dderby.system.home=${wareHousePath.getCanonicalPath}",
@@ -203,6 +205,8 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils {
"--master", "local[2]",
"--conf", "spark.ui.enabled=false",
"--conf", "spark.master.rest.enabled=false",
"--conf", "spark.sql.hive.metastore.version=1.2.1",
"--conf", "spark.sql.hive.metastore.jars=maven",
"--conf", s"spark.sql.warehouse.dir=${wareHousePath.getCanonicalPath}",
"--driver-java-options", s"-Dderby.system.home=${wareHousePath.getCanonicalPath}",
unusedJar.toString)
@@ -206,7 +206,13 @@ class DataSourceWithHiveMetastoreCatalogSuite
      assert(columns.map(_.dataType) === Seq(DecimalType(10, 3), StringType))

      checkAnswer(table("t"), testDF)
      assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM t") === Seq("1.1\t1", "2.1\t2"))
      if (HiveUtils.isHive23) {
        assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM t") ===
          Seq("1.100\t1", "2.100\t2"))
      } else {
        assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM t") ===
          Seq("1.1\t1", "2.1\t2"))
      }
    }
  }

@@ -238,8 +244,13 @@ class DataSourceWithHiveMetastoreCatalogSuite
      assert(columns.map(_.dataType) === Seq(DecimalType(10, 3), StringType))

      checkAnswer(table("t"), testDF)
      assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM t") ===
        Seq("1.1\t1", "2.1\t2"))
      if (HiveUtils.isHive23) {
        assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM t") ===
          Seq("1.100\t1", "2.100\t2"))
      } else {
        assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM t") ===
          Seq("1.1\t1", "2.1\t2"))
      }
    }
  }
}
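
The split expectations above come down to how the two Hive versions render `DecimalType(10, 3)` values: Hive 2.3 prints the full declared scale, while Hive 1.2 trimmed trailing zeros. A rough `java.math.BigDecimal` analogue (an analogy, not the actual Hive code path):

```scala
import java.math.BigDecimal

val v = new BigDecimal("1.1")
// Hive 2.3-style rendering: keep the declared scale of 3 -> "1.100"
assert(v.setScale(3).toPlainString == "1.100")
// Hive 1.2-style rendering: trailing zeros trimmed -> "1.1"
assert(v.setScale(3).stripTrailingZeros.toPlainString == "1.1")
```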
@@ -35,10 +35,18 @@ class HiveShimSuite extends SparkFunSuite {

    // test when READ_COLUMN_NAMES_CONF_STR is empty
    HiveShim.appendReadColumns(conf, ids, names)
    assert(names.asJava === ColumnProjectionUtils.getReadColumnNames(conf))
    if (HiveUtils.isHive23) {
      assert(names === ColumnProjectionUtils.getReadColumnNames(conf))
    } else {
      assert(names.asJava === ColumnProjectionUtils.getReadColumnNames(conf))
    }

    // test when READ_COLUMN_NAMES_CONF_STR is non-empty
    HiveShim.appendReadColumns(conf, moreIds, moreNames)
    assert((names ++ moreNames).asJava === ColumnProjectionUtils.getReadColumnNames(conf))
    if (HiveUtils.isHive23) {
      assert((names ++ moreNames) === ColumnProjectionUtils.getReadColumnNames(conf))
    } else {
      assert((names ++ moreNames).asJava === ColumnProjectionUtils.getReadColumnNames(conf))
    }
  }
}
@@ -100,8 +100,14 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto
.asInstanceOf[HiveTableRelation]

val properties = relation.tableMeta.ignoredProperties
      assert(properties("totalSize").toLong <= 0, "external table totalSize must be <= 0")
      assert(properties("rawDataSize").toLong <= 0, "external table rawDataSize must be <= 0")
      if (HiveUtils.isHive23) {
        // Since HIVE-6727, Hive fixed the issue where table-level stats for external tables
        // were set incorrectly.
        assert(properties("totalSize").toLong == 6)
        assert(properties.get("rawDataSize").isEmpty)
      } else {
        assert(properties("totalSize").toLong <= 0, "external table totalSize must be <= 0")
        assert(properties("rawDataSize").toLong <= 0, "external table rawDataSize must be <= 0")
      }

      val sizeInBytes = relation.stats.sizeInBytes
      assert(sizeInBytes === BigInt(file1.length() + file2.length()))
@@ -865,17 +871,25 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto
      val totalSize = extractStatsPropValues(describeResult, "totalSize")
      assert(totalSize.isDefined && totalSize.get > 0, "totalSize is lost")

      // ALTER TABLE SET/UNSET TBLPROPERTIES invalidates some Hive specific statistics, but not
      // Spark specific statistics. This is triggered by the Hive alterTable API.
      val numRows = extractStatsPropValues(describeResult, "numRows")
      assert(numRows.isDefined && numRows.get == -1, "numRows is lost")
      val rawDataSize = extractStatsPropValues(describeResult, "rawDataSize")
      assert(rawDataSize.isDefined && rawDataSize.get == -1, "rawDataSize is lost")

      if (analyzedBySpark) {
      if (HiveUtils.isHive23) {
        // Since HIVE-15653 (Hive 2.3.0), Hive fixed the issue where some ALTER TABLE commands
        // dropped table stats.
        assert(numRows.isDefined && numRows.get == 500)
        val rawDataSize = extractStatsPropValues(describeResult, "rawDataSize")
        assert(rawDataSize.isDefined && rawDataSize.get == 5312)
        checkTableStats(tabName, hasSizeInBytes = true, expectedRowCounts = Some(500))
      } else {
        checkTableStats(tabName, hasSizeInBytes = true, expectedRowCounts = None)
        // ALTER TABLE SET/UNSET TBLPROPERTIES invalidates some Hive specific statistics, but not
        // Spark specific statistics. This is triggered by the Hive alterTable API.
        assert(numRows.isDefined && numRows.get == -1, "numRows is lost")
        val rawDataSize = extractStatsPropValues(describeResult, "rawDataSize")
        assert(rawDataSize.isDefined && rawDataSize.get == -1, "rawDataSize is lost")

        if (analyzedBySpark) {
          checkTableStats(tabName, hasSizeInBytes = true, expectedRowCounts = Some(500))
        } else {
          checkTableStats(tabName, hasSizeInBytes = true, expectedRowCounts = None)
        }
      }
    }
  }
