-
Notifications
You must be signed in to change notification settings - Fork 29.1k
[SPARK-29987][SQL] Add CatalogTable cache in SessionCatalog to improve performance #26627
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
83b3c64
9d88db8
83edf2a
ba900c6
e710324
9d0ed7b
8dc7f91
9bc6abd
be169d8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.catalog | |
|
|
||
| import java.net.URI | ||
| import java.util.Locale | ||
| import java.util.concurrent.Callable | ||
| import java.util.concurrent.{Callable, ExecutionException, TimeUnit} | ||
| import javax.annotation.concurrent.GuardedBy | ||
|
|
||
| import scala.collection.mutable | ||
|
|
@@ -31,7 +31,7 @@ import org.apache.hadoop.fs.Path | |
|
|
||
| import org.apache.spark.internal.Logging | ||
| import org.apache.spark.sql.AnalysisException | ||
| import org.apache.spark.sql.catalyst._ | ||
| import org.apache.spark.sql.catalyst.{QualifiedTableName, _} | ||
| import org.apache.spark.sql.catalyst.analysis._ | ||
| import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder | ||
| import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionInfo, ImplicitCastInputTypes} | ||
|
|
@@ -104,6 +104,12 @@ class SessionCatalog( | |
|
|
||
| private val validNameFormat = "([\\w_]+)".r | ||
|
|
||
| // Cache of CatalogTable metadata keyed by QualifiedTableName. Each entry expires | ||
| // conf.tableCatalogCacheExpireSeconds seconds after it was written. | ||
| private val catalogTableCache = | ||
| CacheBuilder.newBuilder() | ||
| .expireAfterWrite(conf.tableCatalogCacheExpireSeconds, TimeUnit.SECONDS) | ||
| .build[QualifiedTableName, CatalogTable]() | ||
|
|
||
| /** | ||
| * Checks if the given name conforms to the Hive standard ("[a-zA-Z_0-9]+"), | ||
| * i.e. if this name only contains characters, numbers, and _. | ||
|
|
@@ -222,6 +228,7 @@ class SessionCatalog( | |
| if (cascade && databaseExists(dbName)) { | ||
| listTables(dbName).foreach { t => | ||
| invalidateCachedTable(QualifiedTableName(dbName, t.table)) | ||
| invalidateCachedCatalogTable(QualifiedTableName(dbName, t.table)) | ||
| } | ||
| } | ||
| externalCatalog.dropDatabase(dbName, ignoreIfNotExists, cascade) | ||
|
|
@@ -366,6 +373,7 @@ class SessionCatalog( | |
| tableDefinition.copy(identifier = tableIdentifier) | ||
| } | ||
|
|
||
| invalidateCachedCatalogTable(QualifiedTableName(db, table)) | ||
| externalCatalog.alterTable(newTableDefinition) | ||
| } | ||
|
|
||
|
|
@@ -386,7 +394,7 @@ class SessionCatalog( | |
| requireDbExists(db) | ||
| requireTableExists(tableIdentifier) | ||
|
|
||
| val catalogTable = externalCatalog.getTable(db, table) | ||
| val catalogTable = getTableMetadata(tableIdentifier) | ||
| val oldDataSchema = catalogTable.dataSchema | ||
| // not supporting dropping columns yet | ||
| val nonExistentColumnNames = | ||
|
|
@@ -399,6 +407,7 @@ class SessionCatalog( | |
| """.stripMargin) | ||
| } | ||
|
|
||
| invalidateCachedCatalogTable(QualifiedTableName(db, table)) | ||
| externalCatalog.alterTableDataSchema(db, table, newDataSchema) | ||
| } | ||
|
|
||
|
|
@@ -416,6 +425,7 @@ class SessionCatalog( | |
| val tableIdentifier = TableIdentifier(table, Some(db)) | ||
| requireDbExists(db) | ||
| requireTableExists(tableIdentifier) | ||
| invalidateCachedCatalogTable(QualifiedTableName(db, table)) | ||
| externalCatalog.alterTableStats(db, table, newStats) | ||
| // Invalidate the table relation cache | ||
| refreshTable(identifier) | ||
|
|
@@ -428,7 +438,12 @@ class SessionCatalog( | |
| def tableExists(name: TableIdentifier): Boolean = synchronized { | |
| val db = formatDatabaseName(name.database.getOrElse(currentDb)) | |
| val table = formatTableName(name.table) | |
| externalCatalog.tableExists(db, table) /* NOTE(review): result is unused and the next line repeats the call; this looks like the pre-change side of the diff hunk - confirm */ | |
| val exists = externalCatalog.tableExists(db, table) | |
| if (!exists) { | |
| // Best effort: the external catalog reports no such table, so drop any stale cached entry for it. | |
| invalidateCachedCatalogTable(QualifiedTableName(db, table)) | |
| } | |
| exists | |
| } | |
|
|
||
| /** | ||
|
|
@@ -440,9 +455,12 @@ class SessionCatalog( | |
| def getTableMetadata(name: TableIdentifier): CatalogTable = { | |
| val db = formatDatabaseName(name.database.getOrElse(getCurrentDatabase)) | |
| val table = formatTableName(name.table) | |
| requireDbExists(db) /* NOTE(review): this line and the next two repeat the loader body below and look like the pre-change side of the diff hunk - confirm */ | |
| requireTableExists(TableIdentifier(table, Some(db))) | |
| externalCatalog.getTable(db, table) | |
| val qtn = QualifiedTableName(db, table) /* cache key: formatted database name plus formatted table name */ | |
| getOrCacheCatalogTable(qtn, () => { /* loader handed to the cache; it produces the entry via the existence checks and external-catalog lookup below */ | |
| requireDbExists(db) | |
| requireTableExists(TableIdentifier(table, Some(db))) | |
| externalCatalog.getTable(db, table) | |
| }) | |
| } | |
|
|
||
| /** | ||
|
|
@@ -669,6 +687,7 @@ class SessionCatalog( | |
| requireTableNotExists(TableIdentifier(newTableName, Some(db))) | ||
| validateName(newTableName) | ||
| validateNewLocationOfRename(oldName, newName) | ||
| invalidateCachedCatalogTable(QualifiedTableName(db, oldTableName)) | ||
| externalCatalog.renameTable(db, oldTableName, newTableName) | ||
| } else { | ||
| if (newName.database.isDefined) { | ||
|
|
@@ -711,6 +730,7 @@ class SessionCatalog( | |
| // The table is dropped only if it exists. When it does not exist and ignoreIfNotExists is | |
| // false, a NoSuchTableException is thrown. | |
| if (tableExists(TableIdentifier(table, Option(db)))) { | ||
| invalidateCachedCatalogTable(QualifiedTableName(db, table)) | ||
| externalCatalog.dropTable(db, table, ignoreIfNotExists = true, purge = purge) | ||
| } else if (!ignoreIfNotExists) { | ||
| throw new NoSuchTableException(db = db, table = table) | ||
|
|
@@ -872,6 +892,8 @@ class SessionCatalog( | |
| // Also invalidate the table relation cache. | ||
| val qualifiedTableName = QualifiedTableName(dbName, tableName) | ||
| tableRelationCache.invalidate(qualifiedTableName) | ||
|
|
||
| invalidateCachedCatalogTable(qualifiedTableName) | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -908,6 +930,7 @@ class SessionCatalog( | |
| requireTableExists(TableIdentifier(table, Option(db))) | ||
| requireExactMatchedPartitionSpec(parts.map(_.spec), getTableMetadata(tableName)) | ||
| requireNonEmptyValueInPartitionSpec(parts.map(_.spec)) | ||
| invalidateCachedCatalogTable(QualifiedTableName(db, table)) | ||
| externalCatalog.createPartitions( | ||
| db, table, partitionWithQualifiedPath(tableName, parts), ignoreIfExists) | ||
| } | ||
|
|
@@ -928,6 +951,7 @@ class SessionCatalog( | |
| requireTableExists(TableIdentifier(table, Option(db))) | ||
| requirePartialMatchedPartitionSpec(specs, getTableMetadata(tableName)) | ||
| requireNonEmptyValueInPartitionSpec(specs) | ||
| invalidateCachedCatalogTable(QualifiedTableName(db, table)) | ||
| externalCatalog.dropPartitions(db, table, specs, ignoreIfNotExists, purge, retainData) | ||
| } | ||
|
|
||
|
|
@@ -950,6 +974,7 @@ class SessionCatalog( | |
| requireExactMatchedPartitionSpec(newSpecs, tableMetadata) | ||
| requireNonEmptyValueInPartitionSpec(specs) | ||
| requireNonEmptyValueInPartitionSpec(newSpecs) | ||
| invalidateCachedCatalogTable(QualifiedTableName(db, table)) | ||
| externalCatalog.renamePartitions(db, table, specs, newSpecs) | ||
| } | ||
|
|
||
|
|
@@ -969,6 +994,7 @@ class SessionCatalog( | |
| requireTableExists(TableIdentifier(table, Option(db))) | ||
| requireExactMatchedPartitionSpec(parts.map(_.spec), getTableMetadata(tableName)) | ||
| requireNonEmptyValueInPartitionSpec(parts.map(_.spec)) | ||
| invalidateCachedCatalogTable(QualifiedTableName(db, table)) | ||
| externalCatalog.alterPartitions(db, table, partitionWithQualifiedPath(tableName, parts)) | ||
| } | ||
|
|
||
|
|
@@ -1484,6 +1510,41 @@ class SessionCatalog( | |
| require(functionBuilder.isDefined, s"built-in function '$f' is missing function builder") | ||
| functionRegistry.registerFunction(f, expressionInfo.get, functionBuilder.get) | ||
| } | ||
| invalidateAllCachedCatalogTables() | ||
| } | ||
|
|
||
| /** | |
| * Looks up the [[CatalogTable]] currently cached under `qtn` without loading it. | |
| * | |
| * @param qtn qualified name used as the cache key | |
| * @return `Some(table)` when an entry is cached, `None` otherwise | |
| */ | |
| private[sql] def getCachedCatalogTable(qtn: QualifiedTableName): Option[CatalogTable] = { | |
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add comments. Other new methods as well. |
||
| // `getIfPresent` returns null on a miss; `Option(...)` maps that null to None. | ||
| Option(catalogTableCache.getIfPresent(qtn)) | ||
| } | ||
|
|
||
| /** | ||
| * Returns the [[CatalogTable]] cached under `qtn`; when none is cached, `init` is run through | ||
| * the cache to load the value, which is then stored and returned. | ||
| * | ||
| * Guava reports loader failures wrapped: `ExecutionException` for checked exceptions, | ||
| * `UncheckedExecutionException` for unchecked ones and `ExecutionError` for errors. All three | ||
| * wrappers are stripped here so callers see what `init` actually threw. | ||
| * | ||
| * @param qtn qualified name used as the cache key | ||
| * @param init loader invoked by the cache on a miss | ||
| * @return the cached or freshly loaded table metadata | ||
| */ | ||
| private[sql] def getOrCacheCatalogTable( | ||
| qtn: QualifiedTableName, | ||
| init: Callable[CatalogTable]): CatalogTable = { | ||
| try { | ||
| catalogTableCache.get(qtn, init) | ||
| } catch { | ||
| // The guards keep a wrapper without a cause from turning into `throw null`. | ||
| case e: ExecutionException if e.getCause != null => | ||
| throw e.getCause | ||
| case e: com.google.common.util.concurrent.UncheckedExecutionException | ||
| if e.getCause != null => | ||
| throw e.getCause | ||
| case e: com.google.common.util.concurrent.ExecutionError if e.getCause != null => | ||
| throw e.getCause | ||
| } | ||
| } | ||
|
|
||
| /** Stores `catalogTable` in the catalog table cache under the key `qtn`. */ private[sql] def cacheCatalogTable(qtn: QualifiedTableName, catalogTable: CatalogTable): Unit = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Caches should pass a Callable so that populating the cache can be combined with a get operation (get or initialize). Instead of
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, fix these. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here |
||
| catalogTableCache.put(qtn, catalogTable) | ||
| } | ||
|
|
||
| /** | ||
| * Discards every entry in the catalog table cache. | ||
| * | ||
| * This must call `invalidateAll()`: Guava's `cleanUp()` only performs pending maintenance | ||
| * (such as evicting entries that have already expired) and leaves live entries cached. | ||
| */ | ||
| private[sql] def invalidateAllCachedCatalogTables(): Unit = { | ||
| catalogTableCache.invalidateAll() | ||
| } | ||
|
|
||
| /** | ||
| * Discards the catalog table cache entry stored under `qtn`, if there is one. | ||
| * | ||
| * @param qtn qualified name used as the cache key | ||
| */ | ||
| private[sql] def invalidateCachedCatalogTable(qtn: QualifiedTableName): Unit = | ||
| catalogTableCache.invalidate(qtn) | ||
|
|
||
| /** | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These two methods,
`invalidateCachedTable` and `invalidateCachedCatalogTable`, have really confusing names. I suggest rewording them to make the names more intuitive.