From f6f0217d573b09d801546c9264464eee8b13c188 Mon Sep 17 00:00:00 2001
From: Vlad Glinsky
Date: Fri, 18 Dec 2020 13:36:18 +0200
Subject: [PATCH 1/2] [SPARK-33841][CORE] Jobs disappear intermittently from
 the SHS under high load

---
 .../deploy/history/FsHistoryProvider.scala    | 22 ++++++++++++++-----
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
index 7e63d5500b2ab..0d63fb46427b8 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
@@ -460,9 +460,17 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock)
       val newLastScanTime = clock.getTimeMillis()
       logDebug(s"Scanning $logDir with lastScanTime==$lastScanTime")
 
+      val notStale = mutable.HashSet[String]()
       val updated = Option(fs.listStatus(new Path(logDir))).map(_.toSeq).getOrElse(Nil)
         .filter { entry => !isBlacklisted(entry.getPath) }
-        .filter { entry => !isProcessing(entry.getPath) }
+        .filter { entry =>
+          if (isProcessing(entry.getPath)) {
+            notStale.add(entry.getPath.toString())
+            false
+          } else {
+            true
+          }
+        }
         .flatMap { entry => EventLogFileReader(fs, entry) }
         .filter { reader =>
           try {
@@ -562,12 +570,14 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock)
         .last(newLastScanTime - 1)
         .asScala
         .toList
-      stale.filterNot(isProcessing).foreach { log =>
-        log.appId.foreach { appId =>
-          cleanAppData(appId, log.attemptId, log.logPath)
-          listing.delete(classOf[LogInfo], log.logPath)
+      stale.filterNot(isProcessing)
+        .filterNot(info => notStale.contains(info.logPath))
+        .foreach { log =>
+          log.appId.foreach { appId =>
+            cleanAppData(appId, log.attemptId, log.logPath)
+            listing.delete(classOf[LogInfo], log.logPath)
+          }
         }
-      }
 
       lastScanTime.set(newLastScanTime)
     } catch {

From c2b8c2d24ffba385ad345f838b0a9ce1e562d93b Mon Sep 17 00:00:00 2001
From: Vlad Glinsky
Date: Fri, 18 Dec 2020 18:23:29 +0200
Subject: [PATCH 2/2] Address review comments

---
 .../org/apache/spark/deploy/history/FsHistoryProvider.scala  | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
index 0d63fb46427b8..b31333fc48218 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
@@ -460,6 +460,10 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock)
       val newLastScanTime = clock.getTimeMillis()
       logDebug(s"Scanning $logDir with lastScanTime==$lastScanTime")
 
+      // Mark entries that are processing as not stale. Such entries do not have a chance to be
+      // updated with the new 'lastProcessed' time and thus any entity that completes processing
+      // right after this check and before the check for stale entities will be identified as stale
+      // and will be deleted from the UI until the next 'checkForLogs' run.
       val notStale = mutable.HashSet[String]()
       val updated = Option(fs.listStatus(new Path(logDir))).map(_.toSeq).getOrElse(Nil)
         .filter { entry => !isBlacklisted(entry.getPath) }
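
For context, the race this patch closes can be reproduced in miniature. The sketch below is a minimal, self-contained Scala program, not Spark code: `processing`, `listing`, `LogInfo`, and the `midScan` hook are simplified stand-ins for the real FsHistoryProvider structures, introduced here only for illustration. It models a replay task that finishes after the `isProcessing` check in the listing pass but before the stale-entity pass; without the `notStale` guard, that entry would be deleted from the listing until the next scan.

    // Minimal sketch of the SPARK-33841 race (not Spark code; all names here
    // are simplified stand-ins for the real FsHistoryProvider structures).
    import scala.collection.mutable

    object StaleScanSketch {
      final case class LogInfo(logPath: String, lastProcessed: Long)

      private val processing = mutable.HashSet[String]()       // paths being replayed
      private val listing = mutable.HashMap[String, LogInfo]() // stand-in for the KVStore

      private def isProcessing(path: String): Boolean =
        processing.synchronized(processing.contains(path))

      /** One scan pass; `midScan` simulates work finishing between the two checks. */
      def scan(paths: Seq[String], now: Long, midScan: () => Unit = () => ()): Unit = {
        // Entries skipped here get no fresh `lastProcessed`, so remember them.
        val notStale = mutable.HashSet[String]()
        val updated = paths.filter { path =>
          if (isProcessing(path)) {
            notStale.add(path)
            false
          } else {
            true
          }
        }
        updated.foreach(p => listing.update(p, LogInfo(p, lastProcessed = now)))

        midScan() // a replay task may complete exactly here

        // Without the `notStale` filter, an entry whose task finished in `midScan`
        // slips past `filterNot(isProcessing)` and is wrongly deleted.
        listing.values.filter(_.lastProcessed < now).toList
          .filterNot(info => isProcessing(info.logPath))
          .filterNot(info => notStale.contains(info.logPath))
          .foreach(info => listing.remove(info.logPath))
      }

      def main(args: Array[String]): Unit = {
        listing.update("app-1", LogInfo("app-1", lastProcessed = 0L))
        processing.add("app-1") // a replay task holds app-1 when the scan starts
        scan(Seq("app-1"), now = 1L,
          midScan = () => processing.synchronized { processing.remove("app-1"); () })
        assert(listing.contains("app-1"), "app-1 must survive the scan")
        println(s"kept: ${listing("app-1")}")
      }
    }

With the second `filterNot` in place the assertion holds; removing it makes `app-1` vanish from `listing` for one scan cycle, which is exactly the intermittent disappearance from the SHS UI that the commit message describes.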